#!/usr/bin/perl
#
# Copyright (C) Nikhef 2011
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Author:
#     Mischa Sall\'e
#     NIKHEF Amsterdam, the Netherlands
#
########################################################################
#
# Nagios probe to test functioning of gLExec
#
# Nagios state can be one of the following:
# - Missing glexec command:   CRITICAL
# - input proxies empty:      UNKNOWN
# - short timeout exceeded:   WARNING
# - timeout exceeded:         CRITICAL
# - gLExec exit codes:
#     0     glexec succeeded: OK
#     201   Client error:     CRITICAL
#     202   Internal error:   CRITICAL
#     203   Auth error:       CRITICAL
#     204   Overlap:          CRITICAL
#     126   execve failed:    WARNING
#     128+n signal:           WARNING
#     !=0   rc of payload:    WARNING
#
########################################################################

# DEFAULTS
my $probeversion=0.2;

# Note the following defaults can be overridden using cmdline options
my $deftimeout=10;      # Overall timeout for probe
my $defcritical=8;      # When to send SIGTERM
my $defwarning=5;       # When to warn about slow running
my $defpayload="id -a"; # Which payload to run

########################################################################
# Logging package
#   keeps internal log trace which can be dumped with get_log
#
# All state lives in lexicals shared by the subs below, so the package
# behaves as a singleton: calling the methods on the class name
# (logger->log_func(...)) and on an object is equivalent.
########################################################################
package logger;
use strict;
use warnings;
{
    # Messages with a priority above this level are discarded.
    # Initialised here so log_func() is usable before new() is called.
    my $loglevel=0;
    # Accepted log lines, in order of arrival.
    my @logstring;

    # Constructor: new([loglevel])
    # An undefined loglevel resets the level to 0.
    sub new {
        my $classname=shift;
        my $level=shift;
        my $self=bless {}, (ref($classname) || $classname || __PACKAGE__);
        if (defined $level) {
            $self->set_loglevel($level);
        } else {
            $loglevel=0;
        }
        return $self;
    }

    # Sets loglevel; an undefined level is treated as 0 so that later
    # numeric comparisons never see undef.
    sub set_loglevel {
        my $self=shift;
        my $level=shift;
        $loglevel=(defined $level ? $level : 0);
        return $loglevel;
    }

    # Logging function: log_func(priority, "logstring\n", ...);
    # Stores the given lines when priority <= current loglevel.
    sub log_func {
        my $self=shift;
        my $prio=shift;
        return if ($prio > $loglevel);
        push @logstring,@_;
        return;
    }

    # Dumps log: prints every stored line to the currently selected
    # output handle.
    sub get_log {
        my $self=shift;
        foreach my $myentry ( @logstring ) {
            print $myentry;
        }
        return;
    }
}

########################################################################
# Nagios status printing package
#   Can set and dump nagios status output
#
# Like logger, state is kept in shared lexicals (singleton behaviour).
########################################################################
package nagstat;
{
    my $code=3;         # Nagios code (0-3), default status unknown
    my $summary;        # One-line summary; first set_status() call wins
    my $perfdata;       # Optional performance data
    my @stat=("OK","WARNING","CRITICAL","UNKNOWN");

    # Constructor: resets status to "unknown" without summary or perfdata
    sub new {
        my $classname=shift;
        my $self=bless {}, (ref($classname) || $classname || __PACKAGE__);
        $code=3;    # Default status unknown
        $summary=undef;
        $perfdata=undef;
        @stat=("OK","WARNING","CRITICAL","UNKNOWN");
        return $self;
    }

    # Set nagios code (0-3) plus summary
    # Only the first call has effect: once a summary is set, later
    # calls are ignored so the earliest detected problem is reported.
    sub set_status {
        my $self=shift;
        if (!defined $summary) {
            $code=shift;
            $summary=shift;
        }
        return;
    }

    # Set internal performance data
    sub set_perfdata {
        my $self=shift;
        $perfdata=shift;
        return;
    }

    # Printout nagios status, summary and optionally performance data
    # return value is code (0-3)
    # A code outside 0-3 (or undefined) is reported as UNKNOWN (3), so
    # the label is never empty and the exit code stays a valid Nagios
    # state.
    sub get_status {
        my $rc=(defined $code && $code =~ /^[0-3]$/) ? $code : 3;
        if (!defined $summary) {
            $summary="unknown status";
        }
        if (defined $perfdata) {
            print $stat[$rc].": ".$summary."|".$perfdata."\n";
        } else {
            print $stat[$rc].": ".$summary."\n";
        }
        return $rc;
    }
}

########################################################################
# Inter process communication package for nagios probes
#   Starts alarm handler when receiving alarm which checks status of
#   probe, and terminates or kills it.
########################################################################
package probeipc;
use POSIX ":sys_wait_h";
{
    # State is kept in lexicals shared by all subs below (singleton);
    # methods may be called on the class name or on an object.
    my $pid=-1;     # pid returned by the piped open, -1 when no child
    my $wpid=0;     # last waitpid() result: 0 not reaped yet, >0 reaped,
                    # -1 waitpid failed
    my $status=0;   # $? saved directly after the last waitpid()
    my $numsent=0;  # 1 once SIGTERM has been sent by the alarm handler
    my $killtime;   # seconds after which to send SIGKILL
    my $termtime;   # seconds after which to send SIGTERM
    my $exitfunc;   # code ref called to print status and exit

    # Constructor: new(exitfunc,[kill time], [term time])
    # Resets the child bookkeeping and installs the ALRM, INT and TERM
    # signal handlers. Dies when no exitfunc is given.
    sub new {
        my $classname=shift;
        my $func=shift
            or die ($classname."::new() needs exitfunc arg\n");
        my $kill=(shift or 10); # probe default timeout is 10
        my $term=(shift or $kill);
        my $self=bless {}, (ref($classname) || $classname || __PACKAGE__);
        $self->set_exitfunc($func);
        $self->set_killtime($kill);
        $self->set_termtime($term);
        $pid=-1;
        $wpid=0;
        $status=0;
        $numsent=0;
        $SIG{'ALRM'} = \&alarm_handler;
        $SIG{'INT'} = \&int_handler;
        $SIG{'TERM'} = \&int_handler;
        return $self;
    }

    # Sets time after which to send SIGKILL
    sub set_killtime {
        my $self=shift;
        $killtime=shift;
        return;
    }

    # Sets time after which to send SIGTERM
    sub set_termtime {
        my $self=shift;
        $termtime=shift;
        return;
    }

    # Sets function to call when exiting after sending a SIGKILL
    sub set_exitfunc {
        my $self=shift;
        $exitfunc=shift;
        return;
    }

    # Signal handler for SIGALRM
    # First expiry: sends SIGTERM and re-arms the alarm for the
    # remaining killtime-termtime seconds. Second expiry (or when
    # killtime<=termtime): sends SIGKILL and calls exitfunc.
    sub alarm_handler {
        if ($pid<0) { # No pid, nothing to do
            logger->log_func(2,"Payload hasn't started yet\n");
            nagstat->set_status(2,"probe killtime exceeded");
            $exitfunc->();
            return; # only reached when exitfunc does not exit
        }
        # Either is or was a process: test status
        logger->log_func(2,"subprocess is/was running with pid ".$pid."\n");
        if ($wpid!=0) { # process has finished (>0) or waitpid(-1) failed
            return;
        }
        # Get status
        $wpid=waitpid($pid,WNOHANG);
        $status=$?;
        if ($wpid==0) { # Still running
            if ($killtime<=$termtime || $numsent==1) {
                logger->log_func(2,"Sending SIGKILL to ".$pid."\n");
                kill(9,$pid);
                nagstat->set_status(2,"probe timeout exceeded");
                $exitfunc->();
                return; # only reached when exitfunc does not exit
            }
            logger->log_func(2,"Sending SIGTERM to ".$pid."\n");
            kill(15,$pid);
            $numsent=1;
            alarm($killtime-$termtime);
            nagstat->set_status(2,"probe critical time exceeded");
        }
        return;
    }

    # Signal handler for SIGINT and SIGTERM
    # Records the interruption, sends SIGTERM to a still-running child
    # and calls exitfunc.
    sub int_handler {
        my ($sig)=@_;
        logger->log_func(2,"Caught SIG$sig\n");
        nagstat->set_status(2,"probe interrupted with SIG$sig");
        if ($pid<0) { # No pid, nothing to do
            logger->log_func(2,"Payload hasn't started yet\n");
            $exitfunc->();
            return; # only reached when exitfunc does not exit
        }
        # Either is or was a process: test status
        if ($wpid!=0) { # process has finished (>0) or waitpid(-1) failed
            logger->log_func(2,"Subprocess with pid ".$pid." already finished\n");
            $exitfunc->();
            return; # only reached when exitfunc does not exit
        }
        # Get status
        $wpid=waitpid($pid,WNOHANG);
        $status=$?;
        if ($wpid==0) { # Still running: send SIGTERM
            logger->log_func(2,"Sending SIGTERM to ".$pid."\n");
            kill(15,$pid);
        }
        $exitfunc->();
        return;
    }

    # Wait for specified pid and return exitcode and signal number.
    # Returns (rc, signo); both are undef when the child's status could
    # not be determined.
    sub wait_probe {
        my $self=shift;
        my $rc;
        my $signo;
        if ($wpid<=0) {
            # Not reaped by a signal handler yet: block until it exits
            # and remember the status right away.
            $wpid=waitpid($pid,0);
            $status=$? if ($wpid==$pid);
        }
        if ($wpid==$pid) {
            # Probe has exited (here or in a signal handler). Use the
            # status saved directly after the successful waitpid()
            # instead of relying on $? still being untouched.
            alarm(0);
            $rc=$status >> 8;
            $signo=$status & 127;
        } elsif ($wpid==-1) {
            # waitpid failed: fall back to the last saved status
            $rc=$status >> 8;
            $signo=$status & 127;
        }
        return ($rc,$signo);
    }

    # Starts specified command: run_probe("command",\$rc,\$signo) and returns 0
    # on normal exit, or 1 when command cannot be started.
    # Output of command is stored as log info. Nagios status is set when
    # applicable.
    sub run_probe {
        my $self=shift;
        my $command=shift;
        my $rc=shift;
        my $signo=shift;

        # Start command; stderr is merged into stdout by the shell
        my $child=open(my $fh, '-|', $command." 2>&1");
        if (!defined($child)) {
            alarm(0);
            $pid=-1; # keep the "no child" marker valid for the handlers
            nagstat->set_status(2,"Failed to run $command");
            return 1;
        }
        $pid=$child;
        # NOTE(review): the read expression was stripped from the
        # source as received ("my $line=)"); reading from the pipe
        # handle is the evident intent.
        while (my $line=<$fh>) {
            logger->log_func(3,$line);
        }
        ($$rc,$$signo)=$self->wait_probe();
        return 0;
    }
}

########################################################################
# Running main probe package
########################################################################
package main;
use strict;
use warnings;
use Env qw(@PATH GLEXEC_LOCATION GLITE_LOCATION X509_USER_PROXY GLEXEC_CLIENT_CERT);
use Getopt::Long qw(:config no_ignore_case bundling);

my $timeout;    # Total maximum runtime for probe
my $critical;   # Time after which to kill gLExec
my $warning;    # Time after which to warn about slow gLExec
my $payload;    # Payload plus arguments: relative uses $PATH to find
my $verbose;    # Verbosity level

# Prints usage output and exits.
# Optional argument: exit code, default 0.
# NOTE(review): the heredoc opener, the first lines of the text and the
# <...> argument placeholders were stripped from the source as
# received; they are reconstructed here - verify against upstream.
sub usage {
    my $exitcode=shift;
    $exitcode=0 if (!defined $exitcode);
    (my $name = $0) =~ s/.*\///;
    print <<EOHELP;
Usage: $name [options]

Options:
 -t|--timeout <timeout>          maximum runtime for probe, default: $deftimeout sec
 -w|--warning <warning>          runtime after which to warn, default: $defwarning sec
 -c|--critical <critical>        runtime after which to probe is to be killed, default: $defcritical sec
 -x|--x509-user-proxy <file>     set X509_USER_PROXY to given file
 -g|--glexec-client-cert <file>  set GLEXEC_CLIENT_CERT to given file
                                 default: value of variable X509_USER_PROXY
 -e|--execute <command>          command to execute by gLExec
                                 default: \"$defpayload\"
 -v|--verbose                    be more verbose, more -v means more verbosity
 -V|--version                    print version
 -h|--help                       show this helptext
EOHELP
    exit $exitcode;
}

# Prints short usage output (oneline)
# NOTE(review): the body of this sub was lost from the source as
# received; the text below is a reconstruction - verify against
# upstream.
sub shortusage {
    (my $name = $0) =~ s/.*\///;
    print <<EOHELP;
Usage: $name [options], use --help for a list of options
EOHELP
    exit 0;
}

# Prints probe version
# NOTE(review): this sub was lost from the source as received (only the
# call in getopts survived); reconstructed - verify against upstream.
sub version {
    (my $name = $0) =~ s/.*\///;
    print <<EOHELP;
$name version: $probeversion
EOHELP
    exit 0;
}

# Parses command line options and sets the probe settings
# NOTE(review): everything before the first option mapping was lost
# from the source as received. The local declarations and the
# application of the defaults are reconstructed (the defaults are not
# applied anywhere else in the visible code) - verify against upstream.
sub getopts {
    my $help;
    my $shorthelp;
    my $version;

    $timeout=$deftimeout;
    $critical=$defcritical;
    $warning=$defwarning;
    $payload=$defpayload;

    # Invalid options: print usage and exit UNKNOWN (3). Previously
    # usage() exited 0 here, which Nagios interprets as OK.
    GetOptions(
        "t|timeout=i" => \$timeout,
        "c|critical=i" => \$critical,
        "w|warning=i" => \$warning,
        "x|x509-user-proxy=s" => \$X509_USER_PROXY,
        "g|glexec-client-cert=s" => \$GLEXEC_CLIENT_CERT,
        "e|execute=s" => \$payload,
        "v|verbose+" => \$verbose,
        "help+" => \$help,
        "h+" => \$shorthelp,
        "V|version+" => \$version) or usage(3);
    $help and usage(0);
    $shorthelp and shortusage();
    $version and version();

    if (!defined $GLEXEC_CLIENT_CERT) {
        $GLEXEC_CLIENT_CERT=$X509_USER_PROXY;
    }
    return;
}

# Exit function: prints nagios status and dumps log
sub nagios_exit {
    my $rc=nagstat->get_status();
    # Logging object
    logger->get_log();
    exit $rc;
}

# Finds gLExec in path and pre-specified directories
# Returns the full path of the first executable glexec found, or undef.
sub find_glexec {
    my $glexloc;
    my @DEFAULT_PATH=("/usr/local/sbin","/usr/sbin","/sbin",
                      "/usr/local/bin","/usr/bin");

    # Try GLEXEC_LOCATION
    if (defined $GLEXEC_LOCATION) {
        logger->log_func(3,"GLEXEC_LOCATION=".$GLEXEC_LOCATION."\n");
        $glexloc=$GLEXEC_LOCATION."/sbin/glexec";
        if (-x $glexloc) {
            logger->log_func(2,"gLExec found at ".$glexloc."\n");
            return $glexloc;
        }
        logger->log_func(2,"gLExec NOT found at \$GLEXEC_LOCATION\n");
    }

    # Try GLITE_LOCATION
    $GLITE_LOCATION="/opt/glite" if (!defined $GLITE_LOCATION);
    logger->log_func(3,"GLITE_LOCATION=".$GLITE_LOCATION."\n");
    # Only fall back to "." when PATH is really empty; the former test
    # on $PATH[1] also replaced a PATH consisting of a single directory.
    @PATH=(".") if (!@PATH);
    for my $dir (@PATH,$GLITE_LOCATION."/sbin",@DEFAULT_PATH) {
        logger->log_func(3,"Looking for glexec in ".$dir."\n");
        $glexloc=$dir."/glexec";
        if (-x $glexloc) {
            logger->log_func(2,"gLExec found at ".$glexloc."\n");
            return $glexloc;
        }
    }
    return;
}

# Translates result of gLExec run into nagios status:
# glexec_to_nagios(exitcode, signal number, runtime in seconds)
# Returns 0 when gLExec succeeded but slower than the warning time,
# 1 in all other cases.
# NOTE(review): the header of this file lists "signal" and "rc of
# payload" as WARNING, whereas the code below reports them as CRITICAL
# (2); confirm which is intended.
sub glexec_to_nagios {
    my $rc=shift;
    my $signo=shift;
    my $dt=shift;

    # A child killed by a signal has exit code 0 in its wait status, so
    # the signal has to be tested before the exit code; otherwise a
    # killed gLExec would be reported as success.
    if (defined $signo && $signo!=0) {
        $rc=0 if (!defined $rc);
        nagstat->set_status(2,"exit due to signal $signo ($rc)");
        return 1;
    }
    if (!defined $rc) {
        nagstat->set_status(3,"exit status of gLExec could not be determined");
        return 1;
    }
    if ($rc==0) {
        nagstat->set_perfdata("${dt}s;$warning;$critical;0");
        if ($dt>=$warning) {
            nagstat->set_status(1,"gLExec took long time to succeed");
            return 0;
        } else {
            nagstat->set_status(0,"Success");
            return 1;
        }
    } elsif ($rc==126) {
        nagstat->set_status(1,"executable $payload can't be executed ($rc)");
    } elsif ($rc==201) {
        nagstat->set_status(2,"client error ($rc)");
    } elsif ($rc==202) {
        nagstat->set_status(2,"system error ($rc)");
    } elsif ($rc==203) {
        nagstat->set_status(2,"authorization error ($rc)");
    } elsif ($rc==204) {
        nagstat->set_status(2,"exit code overlap error ($rc)");
    } else {
        nagstat->set_status(2,
            "executable $payload failed with non-zero exit code ($rc)");
    }
    return 1;
}

# Find gLExec command, payload
# command (when relative), runs it and returns
# status
# Returns 1 when a precondition fails or the probe cannot be started,
# otherwise the return value of glexec_to_nagios(). The Nagios status
# itself is recorded via nagstat.
sub run_glexec {
    my $glexec;
    my $exitcode;
    my $signo;
    my $t1;
    my $t2;

    # Make sure to have starttime
    $t1=time();

    # Set alarm before looking for gLExec to prevent NFS timeouts
    alarm($critical);

    # Find glexec command
    if (!defined ($glexec=find_glexec())) {
        nagstat->set_status(2,"glexec command not found");
        return 1;
    }

    # Check proxies
    if (!defined $X509_USER_PROXY) {
        nagstat->set_status(3,"\$X509_USER_PROXY is unset.");
        return 1;
    }
    # -s is only true for an existing file with nonzero size
    if (! -e $X509_USER_PROXY || ! -s $X509_USER_PROXY) {
        nagstat->set_status(3,
            "\$X509_USER_PROXY does not point to a nonempty file.");
        return 1;
    }
    if (!defined $GLEXEC_CLIENT_CERT ||
        ! -e $GLEXEC_CLIENT_CERT || ! -s $GLEXEC_CLIENT_CERT) {
        nagstat->set_status(3,
            "\$GLEXEC_CLIENT_CERT does not point to a nonempty file.");
        return 1;
    }

    # Find full path for payload if it's relative
    if ($payload !~ /^\/.*/) {
        # Command name is everything up to the first space
        (my $name=$payload) =~ s/ .*//;
        for my $dir (@PATH) {
            logger->log_func(3,"Looking for ".$name." in ".$dir."\n");
            my $fullname=$dir."/".$name;
            # Require a regular file: -x alone is also true for
            # directories (e.g. for an empty command name).
            if (-f $fullname && -x _) {
                # \Q..\E: the command name is literal text, not a regex
                $payload =~ s/^\Q$name\E/$fullname/;
                logger->log_func(2,"Payload set to ".$payload."\n");
                last;
            }
        }
    }

    # Run actual probe in child process
    if (probeipc->run_probe("$glexec $payload",\$exitcode,\$signo)!=0) {
        return 1;
    }

    # Probe exited: find exit status
    $t2=time();
    return glexec_to_nagios($exitcode,$signo,$t2-$t1);
}

# Parse commandline options
getopts();

# Initialize logger and set loglevel
logger->new($verbose);

# Initialize objects
nagstat->new();

# Initialize signal handling
probeipc->new(\&nagios_exit,$timeout,$critical);

# run actual gLExec probe
run_glexec();

# Dump nagios status, log and exit
nagios_exit();