/[pdpsoft]/trunk/nagios/glexec/check_glexec.sh
ViewVC logotype

Diff of /trunk/nagios/glexec/check_glexec.sh

Parent Directory | Revision Log | View Patch

trunk/nagios/glexec/check_glexec revision 2453 by msalle, Tue Nov 29 16:22:24 2011 UTC trunk/nagios/glexec/check_glexec.sh revision 2454 by msalle, Sun Dec 4 12:51:33 2011 UTC
# Line 1  Line 1 
1  #!/bin/sh  #!/bin/dash
2  #  #
3  # Copyright (C) Nikhef 2011  # Copyright (C) Nikhef 2011
4  #  #
# Line 54  VERBOSE=0 Line 54  VERBOSE=0
54  # Default timeout  # Default timeout
55  TIMEOUT=10  TIMEOUT=10
56  # Default short timeout, longer than this results in warning  # Default critical timeout, longer than this results in the probe being sent SIGTERM
57  SHORTTIMEOUT=5  CRITTIMEOUT=8
58    # Default short timeout, longer than this results in warning
59    WARNTIMEOUT=5
60  # Default GLEXEC_CLIENT_CERT will be set after arguments are parsed  # Default GLEXEC_CLIENT_CERT will be set after arguments are parsed
61  GLEXEC_CLIENT_CERT=""  GLEXEC_CLIENT_CERT=""
62    
# Line 76  usage()        { Line 78  usage()        {
78      echo      echo
79      echo "Options:"      echo "Options:"
80      echo " -t|--timeout <timeout>          maximum runtime for probe, default $TIMEOUT sec"      echo " -t|--timeout <timeout>          maximum runtime for probe, default $TIMEOUT sec"
81      echo " -s|--shorttimeout <timeout>     runtime after which to warn, default $SHORTTIMEOUT sec"      echo " -w|--warning <timeout>          runtime after which to warn, default $WARNTIMEOUT sec"
82        echo " -c|--critical <timeout>         runtime after which to probe is to be killed, default $CRITTIMEOUT sec"
83      echo " -x|--x509-user-proxy <file>     set X509_USER_PROXY to given file"      echo " -x|--x509-user-proxy <file>     set X509_USER_PROXY to given file"
84      echo " -g|--glexec-client-cert <file>  set GLEXEC_CLIENT_CERT to given file"      echo " -g|--glexec-client-cert <file>  set GLEXEC_CLIENT_CERT to given file"
85      echo "                                 default: value of variable X509_USER_PROXY"      echo "                                 default: value of variable X509_USER_PROXY"
# Line 100  log()  { Line 103  log()  {
103  }  }
104    
105  # Prints nagios status line: <stat>: <summary>  # Prints nagios status line: <stat>: <summary>
106  nagios_printout()   {  nagios_status()   {
107      code=$1      code=$1
108      shift      shift
109      summary="$*"      summary="$*"
# Line 111  nagios_printout()   { Line 114  nagios_printout()   {
114          3)  stat='UNKNOWN' ;;          3)  stat='UNKNOWN' ;;
115          *)  stat='INVALID NAGIOS CODE $1' ;;          *)  stat='INVALID NAGIOS CODE $1' ;;
116      esac      esac
117      log 0 "$stat: $summary"      log 0 "NAGIOS_STATUS_LINE $code $stat: $summary"
118        exit $code
119    }
120    
121    # Prints last line first, then rest. Last line is supposed to contain
122    #  NAGIOS_STATUS_LINE  exitcode text
123    nagios_output()   {
124        logstring=""
125        nagiosline=""
126        code=0
127        while read line;do
128            line2=${line##NAGIOS_STATUS_LINE }
129            if [ "$line2" = "$line" ];then
130                if [ -z "$logstring" ];then
131                    logstring="${line}"
132                else
133                    logstring="${logstring}\n${line}"
134                fi
135            else
136                code=`echo $line2|cut -d' ' -f1`
137                nagiosline=`echo $line2|cut -d' ' -f2-`
138            fi
139        done
140        if [ -z "$nagiosline" ];then
141            printf "${logstring}"
142        else
143            printf "${nagiosline}\n${logstring}"
144        fi
145      exit $code      exit $code
146  }  }
147    
# Line 123  parse_err()    { Line 153  parse_err()    {
153          [ -z "$summary" ] && summary="$line"          [ -z "$summary" ] && summary="$line"
154          log 2 "$line"          log 2 "$line"
155      done      done
156      nagios_printout $code "$summary"      nagios_status $code "$summary"
157  }  }
158    
159  # don't use builtin which since it might not exist  # don't use builtin which since it might not exist
# Line 161  parse_short_args()     { Line 191  parse_short_args()     {
191          case "$i" in          case "$i" in
192              g)  GLEXEC_CLIENT_CERT="$OPTARG" ;;              g)  GLEXEC_CLIENT_CERT="$OPTARG" ;;
193              x)  X509_USER_PROXY="$OPTARG" ;;              x)  X509_USER_PROXY="$OPTARG" ;;
194              c|t) TIMEOUT=`parse_range "$OPTARG"`              t) TIMEOUT=`parse_range "$OPTARG"`
195                  if [ -z "$TIMEOUT" ];then                  if [ -z "$TIMEOUT" ];then
196                      parse_err "Not a valid timeout: \"$OPTARG\""                      parse_err "Not a valid timeout: \"$OPTARG\""
197                  else                  else
198                      log 2 "TIMEOUT set to $TIMEOUT"                      log 2 "TIMEOUT set to $TIMEOUT"
199                  fi                  fi
200                  ;;                  ;;
201              w)  SHORTTIMEOUT=`parse_range "$OPTARG"`              c) CRITTIMEOUT=`parse_range "$OPTARG"`
202                  if [ -z "$SHORTTIMEOUT" ];then                  if [ -z "$CRITTIMEOUT" ];then
203                        parse_err "Not a valid timeout: \"$OPTARG\""
204                    else
205                        log 2 "CRITTIMEOUT set to $CRITTIMEOUT"
206                    fi
207                    ;;
208                w)  WARNTIMEOUT=`parse_range "$OPTARG"`
209                    if [ -z "$WARNTIMEOUT" ];then
210                      parse_err "Not a valid timeout: \"$OPTARG\""                      parse_err "Not a valid timeout: \"$OPTARG\""
211                  else                  else
212                      log 2 "SHORTTIMEOUT set to $TIMEOUT"                      log 2 "WARNTIMEOUT set to $WARNTIMEOUT"
213                  fi                  fi
214                  ;;                  ;;
215              e)  GLEXEC_CMD_EXE="$OPTARG" ;;              e)  GLEXEC_CMD_EXE="$OPTARG" ;;
216              h)  shortusage ;;              h)  shortusage ;;
217              V)  nagios_printout 0 "$PROG version $VERSION" ;;              V)  nagios_status 0 "$PROG version $VERSION" ;;
218              v)  VERBOSE=`expr $VERBOSE + 1` ;;              v)  VERBOSE=`expr $VERBOSE + 1` ;;
219              H)  log 2 "$PROG: option -H/--hostname is not used" ;;              H)  log 2 "$PROG: option -H/--hostname is not used" ;;
220              p)  log 2 "$PROG: option -p/--port is not used" ;;              p)  log 2 "$PROG: option -p/--port is not used" ;;
# Line 218  parse_args()   { Line 255  parse_args()   {
255                  glexec-client-cert) args="$args -c " ;;                  glexec-client-cert) args="$args -c " ;;
256                  execute)            args="$args -e " ;;                  execute)            args="$args -e " ;;
257                  timeout)            args="$args -t " ;;                  timeout)            args="$args -t " ;;
258                  shorttimeout)       args="$args -s " ;;                  critical)           args="$args -c " ;;
259                    warning)            args="$args -w " ;;
260                  verbose)            args="$args -v " ;;                  verbose)            args="$args -v " ;;
261                  version)            args="$args -V " ;;                  version)            args="$args -V " ;;
262                  help)               usage ;;                  help)               usage ;;
# Line 226  parse_args()   { Line 264  parse_args()   {
264                  hostname)           args="$args -H " ;;                  hostname)           args="$args -H " ;;
265                  port)               args="$args -p " ;;                  port)               args="$args -p " ;;
266                  url)                args="$args -u " ;;                  url)                args="$args -u " ;;
                 warning)            args="$args -w " ;;  
                 critical)           args="$args -c " ;;  
267                  *)                    *)  
268                      parse_err "$PROG: invalid longoption -- '$subarg'" \                      parse_err "$PROG: invalid longoption -- '$subarg'" \
269                                "Try \`$PROG -h' for more information."                                "Try \`$PROG -h' for more information."
# Line 241  parse_args()   { Line 277  parse_args()   {
277      eval parse_short_args `echo $args`      eval parse_short_args `echo $args`
278  }  }
279    
280    # wait wrapper
281    wait_func() {
282        if [ $# -ne 1 ];then
283            log 2 "wait_func needs exactly one argument"
284            return 1
285        fi
286        if [ $VERBOSE -ge 1 ];then
287            wait $1 2>&1
288        else
289            wait $1 2> /dev/null
290        fi
291        return $?
292    }
293    
294    # kill wrapper
295    kill_func() {
296        if [ $# -eq 0 ];then
297            log 2 "kill_func needs at least one argument"
298            return 1
299        fi
300        ps -fjA|grep "${pid##-}"
301        if [ $# -eq 1 ];then
302            signo="-TERM"
303            pid=$1
304        else
305            signo=$1
306            pid=$2
307        fi
308        log 3 "About to send $signo to $pid"
309        if [ $VERBOSE -gt 1 ];then
310            /bin/kill $signo $pid 2>&1
311        else
312            /bin/kill $signo $pid 2> /dev/null
313        fi
314        return $?
315    }
316    
317  # wait for background process to finish or timeout  # wait for background process to finish or timeout
318  waiter()    {  waiter()    {
319      pid=$1      pid=$1
320      sleep $TIMEOUT      code=0
321        if [ $CRITTIMEOUT -lt $TIMEOUT ];then
322            sleep $CRITTIMEOUT
323            if [ -n "`ps -opid= -p $pid`" ];then
324                log 2 "Child process $pid is running after critical range $CRITTIMEOUT sec, sending SIGTERM"
325                kill_func -$pid
326            fi
327            sleep $((TIMEOUT-CRITTIMEOUT))
328        else
329            sleep $TIMEOUT
330        fi
331    
332      # If process still running: kill it      # If process still running: kill it
333      if [ -n "`ps -opid= -p $pid`" ];then      if [ -n "`ps -opid= -p $pid`" ];then
334          # TIMEOUT exceeded: kill it          # TIMEOUT exceeded: kill it
335          log 2 "Child process $pid is still running after timeout $TIMEOUT"          log 2 "Child process $pid is running after timeout $TIMEOUT sec," \
336          kill -9 $pid                "sending SIGKILL"
337          exit 3          kill_func -9 -$pid
     else  
         exit 0  
338      fi      fi
339  }  }
340    
# Line 338  find_glexec()  { Line 419  find_glexec()  {
419      done      done
420  }  }
421    
422  # Full run:  # Full gLExec run including finding the command and printing nagios status when
423  # - search for glexec command  # successful. In case of timeout, nagios status will come from run_probe.
 # - run glexec  
 # - print nagios status  
424  run_glexec()    {  run_glexec()    {
425      # Store start time      # Store start time
426      t1=$(date +%s)      t1=$(date +%s)
# Line 353  run_glexec()   { Line 432  run_glexec()   {
432      if [ -z "$GLEXEC_EXE" ] ; then      if [ -z "$GLEXEC_EXE" ] ; then
433          code=2  # Critical          code=2  # Critical
434          summary='glexec command not found.'          summary='glexec command not found.'
435          nagios_printout $code $summary          nagios_status $code $summary
436      fi      fi
437      # Test proxy variable      # Test proxy variable
438      if [ -z "$X509_USER_PROXY" ] ; then      if [ -z "$X509_USER_PROXY" ] ; then
439          code=3  # Unknown          code=3  # Unknown
440          summary="\$X509_USER_PROXY is unset."          summary="\$X509_USER_PROXY is unset."
441          nagios_printout $code $summary          nagios_status $code $summary
442      fi      fi
443      # Test proxy file      # Test proxy file
444      if [ ! -f "$X509_USER_PROXY" -o ! -s "$X509_USER_PROXY" ] ; then      if [ ! -f "$X509_USER_PROXY" -o ! -s "$X509_USER_PROXY" ] ; then
445          code=3  # Unknown          code=3  # Unknown
446          summary="\$X509_USER_PROXY does not point to a nonempty file."          summary="\$X509_USER_PROXY does not point to a nonempty file."
447          nagios_printout $code $summary          nagios_status $code $summary
448      fi      fi
449    
450      if [ ! -f "$GLEXEC_CLIENT_CERT" -o ! -s "$X509_USER_PROXY" ] ; then      if [ ! -f "$GLEXEC_CLIENT_CERT" -o ! -s "$X509_USER_PROXY" ] ; then
451          code=3  # Unknown          code=3  # Unknown
452          summary="\$GLEXEC_CLIENT_CERT does not point to a nonempty file."          summary="\$GLEXEC_CLIENT_CERT does not point to a nonempty file."
453          nagios_printout $code $summary          nagios_status $code $summary
454      fi      fi
455    
456      log 2 "Running $GLEXEC_EXE $GLEXEC_CMD_EXE"      log 2 "Running $GLEXEC_EXE $GLEXEC_CMD_EXE"
# Line 386  run_glexec()   { Line 465  run_glexec()   {
465      # Store end time      # Store end time
466      dt=$(( $(date +%s) - t1))      dt=$(( $(date +%s) - t1))
467            
468      if [ "$code" -eq 0 -a $dt -gt $SHORTTIMEOUT ];then      if [ "$code" -eq 0 -a $dt -gt $WARNTIMEOUT ];then
469          code=1 # Warning          code=1 # Warning
470          summary="gLExec took long time to succeed"          summary="gLExec took long time to succeed"
471      fi      fi
472    
473      perfdata="time=${dt}s;$SHORTTIMEOUT;$TIMEOUT;0"      perfdata="time=${dt}s;$WARNTIMEOUT;$CRITTIMEOUT;0"
474      nagios_printout $code "$summary|$perfdata"      nagios_status $code "$summary|$perfdata"
475  }  }
476    
477  ########################################################################  ########################################################################
478  #  #
479  # main program  # main
480  #  #
481  ########################################################################  ########################################################################
482    
483  # Parse cmdline arguments (long ones are converted in corresponding short ones)  {
484  parse_args "$@"      # Turn on jobcontrol (separate process groups for subshells), such that we
485        # can kill the process group for the background processes.
486  # Start glexec_run in background to have control over timeout      set -m
487  run_glexec &  
488  probe_pid=$!      # Parse cmdline arguments (long ones are converted in corresponding short
489        # ones)
490  # Start watch process in background: will kill probe after timeout      parse_args "$@"
491  waiter $probe_pid &      
492  waiter_pid=$!      # Start glexec_run in background to have control over timeout
493        run_glexec &
494  # Wait for run_glexec: it will either end by itself or by the waiter()      probe_pid=$!
495  log 3 "Waiting at most $TIMEOUT seconds for probe $probe_pid to finish"  
496  if [ $VERBOSE -ge 1 ];then      # Start watch process in background: will kill probe after timeout
497      wait $probe_pid      waiter $probe_pid &
498  else      waiter_pid=$!
499      wait $probe_pid 2> /dev/null  
500  fi      # Wait for run_glexec: it will either end by itself or by the waiter()
501  probe_rc=$?      log 3 "Waiting at most $TIMEOUT seconds for probe $probe_pid to finish"
502        wait_func $probe_pid
503  # Kill the waiter if it is still there      probe_rc=$?
504  if [ -n "`ps -opid= -p $waiter_pid`" ];then  
505      kill $waiter_pid 2> /dev/null      # Kill the waiter if it is still there
506        if [ -n "`ps -opid= -p $waiter_pid`" ];then
507            log 3 "Cleaning up waiter process $waiter_pid"
508            kill_func -$waiter_pid
509        fi
510      # Call wait here to prevent logging of termination at end of script      # Call wait here to prevent logging of termination at end of script
511      wait $waiter_pid 2> /dev/null      wait_func $waiter_pid
 fi  
512    
513  # If probe was killed, its exit value will be outside valid nagios range of 0-3      # If probe was killed, its exit value will be outside valid nagios range of
514  if [ $probe_rc -gt 3 ];then      # 0-3, if gLExec itself fails, the run_probe will exit with a 3
515      code=2  # Critical      case "$probe_rc" in
516      if [ $VERBOSE -eq 0 ];then          0|1|2|3) # Normal ending of run_glexec, which has called nagios_status
517          nagios_printout $code "probe TIMEOUT of $TIMEOUT seconds exceeded"              code=$probe_rc; exit $code
518      else              ;;
519          nagios_printout $code "probe TIMEOUT of $TIMEOUT seconds exceeded (rc=$probe_rc)"          137)    # run_glexec ended via SIGKILL
520      fi              nagios_status 2 "probe TIMEOUT of $TIMEOUT seconds exceeded"
521  else              ;;
522      code=$probe_rc          143)    # run_glexec ended via SIGTERM
523  fi              nagios_status 2 "probe critical range of $CRITTIMEOUT seconds exceeded"
524                ;;
525            *)      # run_glexec ended prematurely?!
526                nagios_status 3 "background process died unexpectly with rc=$probe_rc"
527                ;;
528        esac;
529    } | nagios_output
530    
531  # run_glexec has finished: parse its exit value and exit with it  exit $?
 exit $probe_rc  

Legend:
Removed from v.2453  
changed lines
  Added in v.2454

grid.support@nikhef.nl
ViewVC Help
Powered by ViewVC 1.1.28