/[pdpsoft]/trunk/nagios/glexec/check_glexec
ViewVC logotype

Annotation of /trunk/nagios/glexec/check_glexec

Parent Directory | Revision Log


Revision 2456 - (hide annotations) (download)
Mon Dec 5 16:45:13 2011 UTC (10 years, 5 months ago) by msalle
File size: 14629 byte(s)
Add first version of EES probe. Add unsupported options to glexec probe and
split short and long -h/--help

1 msalle 2454 #!/usr/bin/perl
2     #
3     # Copyright (C) Nikhef 2011
4     #
5     # Licensed under the Apache License, Version 2.0 (the "License");
6     # you may not use this file except in compliance with the License.
7     # You may obtain a copy of the License at
8     #
9     # http://www.apache.org/licenses/LICENSE-2.0
10     #
11     # Unless required by applicable law or agreed to in writing, software
12     # distributed under the License is distributed on an "AS IS" BASIS,
13     # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14     # See the License for the specific language governing permissions and
15     # limitations under the License.
16     #
17     # Author:
18     # Mischa Sall\'e <msalle@nikhef.nl>
19     # NIKHEF Amsterdam, the Netherlands
20     #
21     ########################################################################
22     #
23     # Nagios probe to test functioning of gLExec
24     #
25     # Nagios state can be one of the following:
26     # - Missing glexec command: CRITICAL
27     # - input proxies empty: UNKNOWN
28     # - short timeout exceeded: WARNING
29     # - timeout exceeded: CRITICAL
30     # - gLExec exit codes:
31     # 0 glexec succeeded: OK
32     # 201 Client error: CRITICAL
33     # 202 Internal error: CRITICAL
34     # 203 Auth error: CRITICAL
35     # 204 Overlap: CRITICAL
36     # 126 execve failed: WARNING
37     # 128+n signal: WARNING
38     # !=0 rc of payload: WARNING
39     #
40     ########################################################################
41    
# Version number printed by the -V|--version option.
my $probeversion = 0.2;

# Built-in defaults; each of them can be overridden on the command line.
my $deftimeout  = 10;        # overall timeout for the probe (seconds)
my $defcritical = 8;         # when to send SIGTERM (seconds)
my $defwarning  = 5;         # when to warn about slow running (seconds)
my $defpayload  = "id -a";   # payload command to run
50    
51     ########################################################################
52     # Logging package
53     # keeps internal log trace which can be dumped with dump_log
54     ########################################################################
package logger;
use strict;
use warnings;
{
    # The log state is kept in lexicals shared by all subs below instead of
    # in the blessed object, so the methods also work when invoked on the
    # class name (logger->log_func(...)).
    my $loglevel;     # messages with a priority above this are dropped
    my @logstring;    # recorded messages, in the order they were logged

    # Constructor: new([level]). Without a level the loglevel becomes 0.
    sub new {
        my ($classname, $level) = @_;
        my $self = bless {}, __PACKAGE__;
        if (defined $level) {
            $self->set_loglevel($level);
        }
        else {
            $loglevel = 0;
        }
        return $self;
    }

    # Sets loglevel: set_loglevel(level)
    sub set_loglevel($) {
        my ($self, $level) = @_;
        $loglevel = $level;
    }

    # Logging function: log_func(priority, "logstring\n", ...);
    # The strings are only recorded when priority does not exceed the
    # current loglevel.
    sub log_func($@) {
        my ($self, $prio, @lines) = @_;
        return if ($prio > $loglevel);
        push @logstring, @lines;
        return;
    }

    # Dumps log: prints every recorded string, oldest first.
    sub get_log(@) {
        my $self = shift;
        print @logstring;
    }
}
100    
101     ########################################################################
102     # Nagios status printing package
103     # Can set and dump nagios status output
104     ########################################################################
package nagstat;
{
    # State is shared by all subs below (not stored in the object), so the
    # methods can be called on the class name as well.
    my $code;        # nagios code, index into @stat
    my $summary;     # one-line summary; undef until a status has been set
    my $perfdata;    # optional performance data string
    my @stat;        # textual names of the nagios codes 0-3

    # Constructor: resets the state to "unknown" (code 3) without summary.
    sub new() {
        my $classname = shift;
        ($code, $summary, $perfdata) = (3, undef, undef);
        @stat = ("OK", "WARNING", "CRITICAL", "UNKNOWN");
        return bless {}, __PACKAGE__;
    }

    # Set nagios code (0-3) plus summary: set_status(code, summary).
    # Only the first call takes effect; once a summary is present, later
    # calls leave code and summary untouched.
    sub set_status($$) {
        my ($self, $newcode, $newsummary) = @_;
        return if (defined $summary);
        $code    = $newcode;
        $summary = $newsummary;
    }

    # Set internal performance data: set_perfdata(string)
    sub set_perfdata($) {
        my ($self, $data) = @_;
        $perfdata = $data;
    }

    # Prints "<STATE>: <summary>", followed by "|<perfdata>" when performance
    # data has been set, and returns the nagios code (0-3).
    sub get_status {
        $summary = "unknown status" unless (defined $summary);
        my $line = $stat[$code].": ".$summary;
        $line .= "|".$perfdata if (defined $perfdata);
        print $line."\n";
        return $code;
    }
}
152    
153     ########################################################################
154     # Inter process communication package for nagios probes
155     # Starts alarm handler when receiving alarm which checks status of
156     # probe, and terminates or kills it.
157     ########################################################################
package probeipc;
use POSIX ":sys_wait_h";
{
    # All state lives in lexicals shared by the subs below rather than in the
    # blessed object, because the signal handlers run without an object.
    my $pid;         # pid of the payload process; -1 while none is running
    my $wpid;        # last waitpid() result for $pid; 0 while not reaped
    my $status;      # raw wait status ($?) saved right after waitpid()
    my $numsent;     # 1 once the alarm handler has sent SIGTERM
    my $killtime;    # time after which to send SIGKILL
    my $termtime;    # time after which to send SIGTERM
    my $exitfunc;    # code ref called to leave the probe

    # Constructor: new(exitfunc,[kill time], [term time])
    # Dies when exitfunc is missing. Kill time defaults to 10, term time to
    # the kill time. Installs the handlers for SIGALRM, SIGINT and SIGTERM.
    sub new {
        my $classname = shift;
        my $self = bless {}, $classname;
        my $func = shift or die ($classname."::new() needs exitfunc arg\n");
        my $kill = (shift or 10); # probe default timeout is 10
        my $term = (shift or $kill);
        $self->set_exitfunc($func);
        $self->set_killtime($kill);
        $self->set_termtime($term);
        $pid = -1;
        $wpid = 0;
        $status = 0;
        $numsent = 0;
        $SIG{'ALRM'} = \&alarm_handler;
        $SIG{'INT'}  = \&int_handler;
        $SIG{'TERM'} = \&int_handler;
        return $self;
    }

    # Sets time after which to send SIGKILL
    sub set_killtime {
        my $self = shift;
        $killtime = shift;
    }

    # Sets time after which to send SIGTERM
    sub set_termtime {
        my $self = shift;
        $termtime = shift;
    }

    # Sets function to call when exiting after sending a SIGKILL
    sub set_exitfunc {
        my $self = shift;
        $exitfunc = shift;
    }

    # Signal handler for SIGALRM.
    # No payload yet: sets a critical status and calls the exit function.
    # Payload still running: sends SIGTERM and re-arms the alarm for the
    # remaining (killtime - termtime) seconds; on the second alarm, or when
    # killtime does not exceed termtime, sends SIGKILL and calls the exit
    # function.
    sub alarm_handler {
        if ($pid < 0) { # No pid, nothing to do
            logger->log_func(2,"Payload hasn't started yet\n");
            nagstat->set_status(2,"probe killtime exceeded");
            $exitfunc->();
        }
        # Either is or was a process: test status
        logger->log_func(2,"subprocess is/was running with pid ".$pid."\n");
        # process has finished (>0) or waitpid failed (-1)
        return if ($wpid != 0);

        # Get status without blocking
        $wpid = waitpid($pid, WNOHANG);
        $status = $?;
        if ($wpid == 0) { # Still running
            if ($killtime <= $termtime || $numsent == 1) {
                logger->log_func(2,"Sending SIGKILL to ".$pid."\n");
                kill(9, $pid);
                nagstat->set_status(2,"probe timeout exceeded");
                $exitfunc->();
            }
            logger->log_func(2,"Sending SIGTERM to ".$pid."\n");
            kill(15, $pid);
            $numsent = 1;
            alarm($killtime - $termtime);
            nagstat->set_status(2,"probe critical time exceeded");
        }
        return;
    }

    # Signal handler for SIGINT and SIGTERM: records the interruption as a
    # critical status, sends SIGTERM to a still running payload and calls the
    # exit function.
    sub int_handler {
        my ($sig) = @_;

        logger->log_func(2,"Caught SIG$sig\n");
        nagstat->set_status(2,"probe interrupted with SIG$sig");
        if ($pid < 0) { # No pid, nothing to do
            logger->log_func(2,"Payload hasn't started yet\n");
            $exitfunc->();
        }
        # Either is or was a process: test status
        if ($wpid != 0) { # process has finished (>0) or waitpid failed (-1)
            logger->log_func(2,
                "Subprocess with pid ".$pid." already finished\n");
            $exitfunc->();
        }
        # Get status without blocking
        $wpid = waitpid($pid, WNOHANG);
        $status = $?;
        if ($wpid == 0) { # Still running: send SIGTERM
            logger->log_func(2,"Sending SIGTERM to ".$pid."\n");
            kill(15, $pid);
        }
        $exitfunc->();
    }

    # Waits for the payload process and returns (exitcode, signal number).
    # Both values are undefined when no wait status could be obtained.
    sub wait_probe {
        my $self = shift;
        my $rc;
        my $signo;

        if ($wpid <= 0) {
            # Not reaped by a signal handler so far: block until it exits.
            my $reaped = waitpid($pid, 0);
            # Save the status at once; $? may be overwritten later on.
            $status = $? if ($reaped == $pid);
            $wpid = $reaped;
        }
        # Cancel the pending alarm once the payload is known to be gone.
        alarm(0) if ($wpid == $pid);
        if ($wpid == $pid || $wpid == -1) {
            # Always decode the saved status: when a signal handler reaped
            # the child (which is presumably what -1 means here), $? is not
            # guaranteed to still hold the child's status.
            $rc = $status >> 8;
            $signo = $status & 127;
        }
        return ($rc, $signo);
    }

    # Starts specified command: run_probe("command",\$rc,\$signo) and returns
    # 0 on normal exit, or 1 when command cannot be started.
    # stderr of the command is merged into its stdout; every output line is
    # stored as log info with priority 3. Nagios status is set when the
    # command cannot be started.
    sub run_probe {
        my ($self, $command, $rc, $signo) = @_;

        # Start from a clean slate so the object can run several commands.
        $pid = -1;
        $wpid = 0;
        $status = 0;
        $numsent = 0;

        # Start command; the "2>&1" redirection is part of the command string
        my $child = open(my $output, '-|', $command." 2>&1");
        if (!defined($child)) {
            alarm(0);
            nagstat->set_status(2,"Failed to run $command");
            return 1;
        }
        $pid = $child;
        while (my $line = <$output>) {
            logger->log_func(3, $line);
        }
        ($$rc, $$signo) = $self->wait_probe();
        # The child has been waited for above, so the result of close()
        # carries no information and is deliberately ignored.
        close($output);
        return 0;
    }
}
307    
308     ########################################################################
309     # Running main probe package
310     ########################################################################
311     package main;
312     use strict;
313     use warnings;
314    
315     use Env qw(@PATH GLEXEC_LOCATION GLITE_LOCATION X509_USER_PROXY GLEXEC_CLIENT_CERT);
316     use Getopt::Long qw(:config no_ignore_case bundling);
317    
# Runtime settings of the probe; getopts() fills them in.
#   $timeout   total maximum runtime for probe
#   $critical  time after which to kill gLExec
#   $warning   time after which to warn about slow gLExec
#   $payload   payload plus arguments: relative uses $PATH to find
#   $verbose   verbosity level
my ($timeout, $critical, $warning, $payload, $verbose);
323    
# Prints the full help text on stdout and terminates the probe with exit
# status 0; this function does not return to its caller.
sub usage() {
    (my $name = $0) =~ s/.*\///;
    print <<EOHELP;
Usage: $name [options]

Options:
 -t|--timeout <timeout>          maximum runtime for probe, default: $deftimeout sec
 -w|--warning <timeout>          runtime after which to warn, default: $defwarning sec
 -c|--critical <timeout>         runtime after which the probe is to be killed,
                                 default: $defcritical sec
 -x|--x509-user-proxy <file>     set X509_USER_PROXY to given file
 -g|--glexec-client-cert <file>  set GLEXEC_CLIENT_CERT to given file
                                 default: value of variable X509_USER_PROXY
 -e|--execute <cmd>              command to execute by gLExec
                                 default: \"$defpayload\"
 -v|--verbose                    be more verbose, more -v means more verbosity
 -V|--version                    print version
 --help                          show this helptext
 -h                              show short usage information
 -H|--host, -p|--port            accepted but ignored
EOHELP
    exit 0;
}
347    
# Prints the one-line usage summary on stdout. The result of print is the
# return value.
sub shortusage() {
    my $name = $0;
    $name =~ s{.*/}{};    # keep only the part after the last slash
    print "Usage: ".$name." [options]\n";
}
355    
# Prints "<program name> version: <probe version>" on stdout. The result of
# print is the return value.
sub version() {
    my $name = $0;
    $name =~ s{.*/}{};    # keep only the part after the last slash
    print $name." version: ".$probeversion."\n";
}
363    
# Parses command line options and sets global variables.
# Starts from the built-in defaults, then applies the options. The options
# -x and -g write directly to the X509_USER_PROXY and GLEXEC_CLIENT_CERT
# environment variables. -H|--host and -p|--port are accepted but not used.
# Exits with status 3 (nagios UNKNOWN) after printing the short usage when
# the options cannot be parsed, and with status 0 after handling --help, -h
# or -V.
sub getopts() {
    my $version;
    my $help;
    my $shorthelp;

    $timeout = $deftimeout;
    $critical = $defcritical;
    $warning = $defwarning;
    $payload = $defpayload;
    my $parsed = GetOptions(
        "t|timeout=i" => \$timeout,
        "c|critical=i" => \$critical,
        "w|warning=i" => \$warning,
        "x|x509-user-proxy=s" => \$X509_USER_PROXY,
        "g|glexec-client-cert=s" => \$GLEXEC_CLIENT_CERT,
        "e|execute=s" => \$payload,
        "v|verbose+" => \$verbose,
        "help+" => \$help,
        "h+" => \$shorthelp,
        "V|version+" => \$version,
        "H|host",
        "p|port"
    );
    if (!$parsed) {
        # usage() would exit with 0, which nagios reads as OK; a wrong
        # invocation must not look like a passed check.
        shortusage();
        exit(3);
    }

    if ($help) {
        usage();
        exit(0);
    }
    if ($shorthelp) {
        shortusage();
        exit(0);
    }
    if ($version) {
        version();
        exit(0);
    }
    # Without an explicit client certificate the user proxy is used for it
    if (!defined $GLEXEC_CLIENT_CERT) {
        $GLEXEC_CLIENT_CERT = $X509_USER_PROXY;
    }
}
398    
# Exit function: prints the nagios status line first, then dumps the
# collected log, and finally exits with the nagios code as exit status.
sub nagios_exit() {
    my $nagioscode = nagstat->get_status();
    logger->get_log();
    exit $nagioscode;
}
408    
# Finds gLExec in path and pre-specified directories.
# Search order: $GLEXEC_LOCATION/sbin, the directories in $PATH,
# $GLITE_LOCATION/sbin and finally a list of default system directories.
# Returns the full path of the first executable glexec found, or undef when
# there is none.
# Side effects: GLITE_LOCATION is set to /opt/glite when it is undefined, and
# PATH is set to "." when it is unset or empty.
sub find_glexec {
    my @default_path = ("/usr/local/sbin", "/usr/sbin", "/sbin",
                        "/usr/local/bin", "/usr/bin");

    # Try GLEXEC_LOCATION
    if (defined $GLEXEC_LOCATION) {
        logger->log_func(3,"GLEXEC_LOCATION=".$GLEXEC_LOCATION."\n");
        my $candidate = $GLEXEC_LOCATION."/sbin/glexec";
        if (-x $candidate) {
            logger->log_func(2,"gLExec found at ".$candidate."\n");
            return $candidate;
        }
        logger->log_func(2,"gLExec NOT found at \$GLEXEC_LOCATION\n");
    }

    # Try GLITE_LOCATION
    $GLITE_LOCATION = "/opt/glite" if (!defined $GLITE_LOCATION);
    logger->log_func(3,"GLITE_LOCATION=".$GLITE_LOCATION."\n");

    # Fall back to the current directory only when there is no PATH at all;
    # a PATH consisting of a single directory must be left alone.
    @PATH = (".") if (!defined $ENV{PATH} || $ENV{PATH} eq "");

    for my $dir (@PATH, $GLITE_LOCATION."/sbin", @default_path) {
        logger->log_func(3,"Looking for glexec in ".$dir."\n");
        my $candidate = $dir."/glexec";
        if (-x $candidate) {
            logger->log_func(2,"gLExec found at ".$candidate."\n");
            return $candidate;
        }
    }
    return;
}
444    
# Translates the result of the gLExec run into a nagios status:
# glexec_to_nagios(exitcode, signal number, runtime in seconds).
# Performance data is only set for a successful run.
# Returns 0 for a successful but slow run and 1 in all other cases.
# NOTE(review): the return values do not separate success from failure, and
# the header comment of this file lists signals and non-zero payload exit
# codes as WARNING while they are reported as CRITICAL here -- confirm which
# is intended.
sub glexec_to_nagios($$$) {
    my ($rc, $signo, $dt) = @_;

    if (!defined $rc) {
        # No wait status available: do not let undef compare equal to 0
        nagstat->set_status(3,"exit status of gLExec could not be determined");
        return 1;
    }
    if (defined $signo && $signo != 0) {
        # Must be tested before the exit code: a process terminated by a
        # signal has exit code 0 and would otherwise be treated as success.
        nagstat->set_status(2,"exit due to signal $signo ($rc)");
        return 1;
    }
    if ($rc == 0) {
        nagstat->set_perfdata("${dt}s;$warning;$critical;0");
        if ($dt >= $warning) {
            nagstat->set_status(1,"gLExec took long time to succeed");
            return 0;
        }
        nagstat->set_status(0,"Success");
        return 1;
    }
    if ($rc == 126) {
        nagstat->set_status(1,"executable $payload can't be executed ($rc)");
    } elsif ($rc == 201) {
        nagstat->set_status(2,"client error ($rc)");
    } elsif ($rc == 202) {
        nagstat->set_status(2,"system error ($rc)");
    } elsif ($rc == 203) {
        nagstat->set_status(2,"authorization error ($rc)");
    } elsif ($rc == 204) {
        nagstat->set_status(2,"exit code overlap error ($rc)");
    } else {
        nagstat->set_status(2,
            "executable $payload failed with non-zero exit code ($rc)");
    }
    return 1;
}
477    
# Find gLExec command, payload command (when relative), runs it and returns
# status.
# Returns 1 when gLExec is not found, a proxy check fails or the command
# cannot be started (the nagios status is set accordingly); otherwise returns
# the result of glexec_to_nagios().
sub run_glexec() {
    my $glexec;
    my $exitcode;
    my $signo;

    # Make sure to have starttime
    my $t1 = time();

    # Set alarm before looking for gLExec to prevent NFS timeouts
    alarm($critical);

    # Find glexec command
    if (!defined ($glexec = find_glexec())) {
        nagstat->set_status(2,"glexec command not found");
        return 1;
    }

    # Check proxies
    if (!defined $X509_USER_PROXY) {
        nagstat->set_status(3,"\$X509_USER_PROXY is unset.");
        return 1;
    }
    if (! -e $X509_USER_PROXY || ! -s $X509_USER_PROXY) {
        nagstat->set_status(3,
            "\$X509_USER_PROXY does not point to a nonempty file.");
        return 1;
    }
    if (! -e $GLEXEC_CLIENT_CERT || ! -s $GLEXEC_CLIENT_CERT) {
        nagstat->set_status(3,
            "\$GLEXEC_CLIENT_CERT does not point to a nonempty file.");
        return 1;
    }

    # Find full path for payload if it's relative
    if ($payload !~ /^\/.*/) {
        # The command name is everything up to the first space
        (my $name = $payload) =~ s/ .*//;
        for my $dir (@PATH) {
            logger->log_func(3,"Looking for ".$name." in ".$dir."\n");
            my $fullname = $dir."/".$name;
            if (-x $fullname) {
                # Replace the leading command name literally; using it as a
                # regex would misbehave for names with characters like '+'.
                substr($payload, 0, length($name), $fullname);
                logger->log_func(2,"Payload set to ".$payload."\n");
                last;
            }
        }
    }

    # Run actual probe in child process
    if (probeipc->run_probe("$glexec $payload",\$exitcode,\$signo) != 0) {
        return 1;
    }

    # Probe exited: find exit status
    my $t2 = time();
    return glexec_to_nagios($exitcode, $signo, $t2 - $t1);
}
539    
# Main program.

# 1. Command line: fills in the global settings
getopts();

# 2. Logger, with the verbosity level as loglevel
logger->new($verbose);

# 3. Nagios status object
nagstat->new();

# 4. Signal handling: nagios_exit is the exit function, the total timeout is
#    the kill time and the critical time is the term time
probeipc->new(\&nagios_exit, $timeout, $critical);

# 5. The actual gLExec probe
run_glexec();

# 6. Dump nagios status and log, then exit
nagios_exit();
557    

Properties

Name Value
svn:executable *

grid.support@nikhef.nl
ViewVC Help
Powered by ViewVC 1.1.28