/[pdpsoft]/trunk/nagios/glexec/check_glexec
ViewVC logotype

Annotation of /trunk/nagios/glexec/check_glexec

Parent Directory Parent Directory | Revision Log Revision Log


Revision 2457 - (hide annotations) (download)
Tue Dec 6 10:19:13 2011 UTC (10 years, 5 months ago) by msalle
File size: 14639 byte(s)
Add (unused) -u/--url option to glexec probe, add EES probe.

1 msalle 2454 #!/usr/bin/perl
2     #
3     # Copyright (C) Nikhef 2011
4     #
5     # Licensed under the Apache License, Version 2.0 (the "License");
6     # you may not use this file except in compliance with the License.
7     # You may obtain a copy of the License at
8     #
9     # http://www.apache.org/licenses/LICENSE-2.0
10     #
11     # Unless required by applicable law or agreed to in writing, software
12     # distributed under the License is distributed on an "AS IS" BASIS,
13     # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14     # See the License for the specific language governing permissions and
15     # limitations under the License.
16     #
17     # Author:
18     # Mischa Sall\'e <msalle@nikhef.nl>
19     # NIKHEF Amsterdam, the Netherlands
20     #
21     ########################################################################
22     #
23     # Nagios probe to test functioning of gLExec
24     #
25     # Nagios state can be one of the following:
26     # - Missing glexec command: CRITICAL
27     # - input proxies empty: UNKNOWN
28     # - short timeout exceeded: WARNING
29     # - timeout exceeded: CRITICAL
30     # - gLExec exit codes:
31     # 0 glexec succeeded: OK
32     # 201 Client error: CRITICAL
33     # 202 Internal error: CRITICAL
34     # 203 Auth error: CRITICAL
35     # 204 Overlap: CRITICAL
36     # 126 execve failed: WARNING
37     # 128+n signal: CRITICAL
38     # !=0 rc of payload: CRITICAL
39     #
40     ########################################################################
41    
# DEFAULTS
# Version reported by -V|--version
my $probeversion = 0.2;

# The values below are only defaults: each one can be overridden with the
# command line option named next to it.
my ($deftimeout, $defcritical, $defwarning) = (
    10,     # -t|--timeout:  overall timeout for probe
    8,      # -c|--critical: when to send SIGTERM
    5,      # -w|--warning:  when to warn about slow running
);
my $defpayload = "id -a";   # -e|--execute: which payload to run
50    
51     ########################################################################
52     # Logging package
53     # keeps internal log trace which can be dumped with dump_log
54     ########################################################################
package logger;
use strict;
use warnings;
{
    # The probe calls every method as a class method (logger->log_func(...)),
    # so the state lives in lexicals shared by the whole package instead of
    # in the blessed object.
    my $loglevel = 0;   # highest priority that is still recorded
    my @logstring;      # recorded lines, in order of arrival

    # Constructor: new([level])
    # Sets the log level to the given level, or to 0 when none is given, and
    # returns an (empty) object blessed into the invoking class.
    sub new {
        my ($classname, $level) = @_;
        my $class = ref($classname) || $classname || __PACKAGE__;
        my $self = bless {}, $class;
        $self->set_loglevel(defined $level ? $level : 0);
        return $self;
    }

    # Sets loglevel: set_loglevel(level)
    # An undefined level is stored as 0 so that log_func never compares
    # against an undefined value.
    sub set_loglevel {
        my ($self, $level) = @_;
        $loglevel = defined $level ? $level : 0;
        return $loglevel;
    }

    # Logging function: log_func(priority, "logstring\n", ...);
    # The strings are stored when priority is not above the current log
    # level and silently dropped otherwise.
    sub log_func {
        my ($self, $prio, @lines) = @_;
        return if ($prio > $loglevel);
        push @logstring, @lines;
        return;
    }

    # Dumps log: prints every stored string to the selected output handle,
    # in the order in which it was logged.
    sub get_log {
        my $self = shift;
        foreach my $myentry (@logstring) {
            print $myentry;
        }
        return;
    }
}
100    
101     ########################################################################
102     # Nagios status printing package
103     # Can set and dump nagios status output
104     ########################################################################
package nagstat;
{
    # The probe calls every method as a class method (nagstat->set_status(...)),
    # so the state lives in lexicals shared by the whole package.
    my @stat = ("OK", "WARNING", "CRITICAL", "UNKNOWN");   # label per code
    my $code = 3;       # current nagios code, default status unknown
    my $summary;        # one-line summary, undef until a status is set
    my $perfdata;       # optional performance data

    # Constructor: resets the status to "unknown" without summary or
    # performance data and returns an object blessed into the invoking class.
    sub new {
        my $classname = shift;
        my $class = ref($classname) || $classname || __PACKAGE__;
        my $self = bless {}, $class;
        $code = 3;
        $summary = undef;
        $perfdata = undef;
        return $self;
    }

    # Set nagios code (0-3) plus summary: set_status(code, "summary")
    # Only the first status is kept: once a summary is defined, later calls
    # are ignored, so the earliest detected problem is the one reported.
    sub set_status {
        my ($self, $newcode, $newsummary) = @_;
        return if (defined $summary);
        $code = $newcode;
        $summary = $newsummary;
        return;
    }

    # Set internal performance data: set_perfdata("perfdata")
    sub set_perfdata {
        my ($self, $data) = @_;
        $perfdata = $data;
        return;
    }

    # Prints "<LABEL>: <summary>" followed by "|<perfdata>" when performance
    # data is set. Return value is the code (0-3).
    sub get_status {
        # A code outside 0..3 has no label and is not a valid exit status
        # for a nagios plugin: report it as UNKNOWN instead.
        if (!defined $code || $code !~ /\A[0-3]\z/) {
            $code = 3;
        }
        if (!defined $summary) {
            $summary = "unknown status";
        }
        my $line = $stat[$code].": ".$summary;
        if (defined $perfdata) {
            $line .= "|".$perfdata;
        }
        print $line."\n";
        return $code;
    }
}
152    
153     ########################################################################
154     # Inter process communication package for nagios probes
155     # Starts alarm handler when receiving alarm which checks status of
156     # probe, and terminates or kills it.
157     ########################################################################
package probeipc;
use POSIX ":sys_wait_h";
{
    # The probe calls every method as a class method and the signal handlers
    # are plain functions, so the state lives in lexicals shared by the whole
    # package.
    my $pid;        # pid returned by the pipe open, -1 while nothing runs
    my $wpid;       # last waitpid() result: 0 = not reaped, $pid = reaped,
                    # -1 = waitpid failed
    my $status;     # wait status ($?) saved right after a waitpid()
    my $numsent;    # set to 1 once SIGTERM has been sent
    my $killtime;   # time after which to send SIGKILL
    my $termtime;   # time after which to send SIGTERM
    my $exitfunc;   # code reference called to leave the probe

    # Constructor: new(exitfunc,[kill time], [term time])
    # Stores the settings, resets the process bookkeeping and installs the
    # handlers for SIGALRM, SIGINT and SIGTERM. Dies without exitfunc.
    sub new {
        my $classname = shift;
        my $class = ref($classname) || $classname || __PACKAGE__;
        my $self = bless {}, $class;
        my $func = shift or die ($class."::new() needs exitfunc arg\n");
        my $kill = (shift or 10);       # probe default timeout is 10
        my $term = (shift or $kill);
        $self->set_exitfunc($func);
        $self->set_killtime($kill);
        $self->set_termtime($term);
        $pid = -1;
        $wpid = 0;
        $status = 0;
        $numsent = 0;
        $SIG{'ALRM'} = \&alarm_handler;
        $SIG{'INT'}  = \&int_handler;
        $SIG{'TERM'} = \&int_handler;
        return $self;
    }

    # Sets time after which to send SIGKILL
    sub set_killtime {
        my ($self, $value) = @_;
        $killtime = $value;
        return;
    }

    # Sets time after which to send SIGTERM
    sub set_termtime {
        my ($self, $value) = @_;
        $termtime = $value;
        return;
    }

    # Sets function to call when exiting after sending a SIGKILL
    sub set_exitfunc {
        my ($self, $value) = @_;
        $exitfunc = $value;
        return;
    }

    # Signal handler for SIGALRM
    # First alarm with a running subprocess: send SIGTERM and re-arm the
    # alarm for the remaining killtime-termtime. Second alarm, or a killtime
    # that is not later than termtime: send SIGKILL and leave via exitfunc.
    sub alarm_handler {
        if ($pid < 0) {     # No pid, nothing to do
            logger->log_func(2,"Payload hasn't started yet\n");
            nagstat->set_status(2,"probe killtime exceeded");
            $exitfunc->();
        }
        # Either is or was a process: test status
        logger->log_func(2,"subprocess is/was running with pid ".$pid."\n");
        # process has finished (>0) or waitpid(-1) failed
        return if ($wpid != 0);

        # Get status; keep it for wait_probe() in case the process is reaped
        # right here
        $wpid = waitpid($pid, WNOHANG);
        $status = $?;
        return if ($wpid != 0);

        # Still running
        if ($killtime <= $termtime || $numsent == 1) {
            logger->log_func(2,"Sending SIGKILL to ".$pid."\n");
            kill(9, $pid);
            nagstat->set_status(2,"probe timeout exceeded");
            $exitfunc->();
        }
        logger->log_func(2,"Sending SIGTERM to ".$pid."\n");
        kill(15, $pid);
        $numsent = 1;
        alarm($killtime - $termtime);
        nagstat->set_status(2,"probe critical time exceeded");
        return;
    }

    # Signal handler for SIGINT and SIGTERM
    # Records the interruption, sends SIGTERM to a subprocess that is still
    # running and always leaves via exitfunc.
    sub int_handler {
        my ($sig) = @_;

        logger->log_func(2,"Caught SIG$sig\n");
        nagstat->set_status(2,"probe interrupted with SIG$sig");
        if ($pid < 0) {     # No pid, nothing to do
            logger->log_func(2,"Payload hasn't started yet\n");
            $exitfunc->();
        }
        # Either is or was a process: test status
        if ($wpid != 0) {   # process has finished (>0) or waitpid(-1) failed
            logger->log_func(2,
                "Subprocess with pid ".$pid." already finished\n");
            $exitfunc->();
        }
        # Get status
        $wpid = waitpid($pid, WNOHANG);
        $status = $?;
        if ($wpid == 0) {   # Still running: send SIGTERM
            logger->log_func(2,"Sending SIGTERM to ".$pid."\n");
            kill(15, $pid);
        }
        $exitfunc->();
    }

    # Wait for the started pid and return (exitcode, signal number).
    # The values are always decoded from the wait status saved in $status,
    # never from the global $?: the process may already have been reaped in
    # alarm_handler(), and $? is overwritten by any later wait or close.
    sub wait_probe {
        my $self = shift;
        my ($rc, $signo);

        if ($wpid <= 0) {
            $wpid = waitpid($pid, 0);
            # Only a successful wait carries a usable status; after a failed
            # one the previously saved value is kept.
            $status = $? if ($wpid == $pid);
        }
        if ($wpid == $pid) {        # probe was reaped, here or in sighandler
            alarm(0);
            $rc = $status >> 8;
            $signo = $status & 127;
        } elsif ($wpid == -1) {     # waitpid failed: use last saved status
            $rc = $status >> 8;
            $signo = $status & 127;
        }
        return ($rc, $signo);
    }

    # Starts specified command: run_probe("command",\$rc,\$signo) and returns 0
    # on normal exit, or 1 when command cannot be started.
    # Output of command (stdout and stderr) is stored as log info with
    # priority 3. Nagios status is set when the command cannot be started.
    sub run_probe {
        my ($self, $command, $rc, $signo) = @_;

        # Start command. The command is a single string containing a
        # redirection, so it is run by the shell.
        $pid = open(my $output, '-|', $command." 2>&1");
        if (!defined($pid)) {
            $pid = -1;      # keep the "nothing running" marker numeric
            alarm(0);
            nagstat->set_status(2,"Failed to run $command");
            return 1;
        }
        while (my $line = <$output>) {
            logger->log_func(3, $line);
        }
        ($$rc, $$signo) = $self->wait_probe();

        # The child has been waited for above, so closing the pipe only
        # releases the handle; keep its side effects on $! and $? local.
        {
            local ($!, $?);
            close($output);
        }
        return 0;
    }
}
307    
308     ########################################################################
309     # Running main probe package
310     ########################################################################
311     package main;
312     use strict;
313     use warnings;
314    
315     use Env qw(@PATH GLEXEC_LOCATION GLITE_LOCATION X509_USER_PROXY GLEXEC_CLIENT_CERT);
316     use Getopt::Long qw(:config no_ignore_case bundling);
317    
# Run-time settings; all are left undefined here and filled in by getopts()
my (
    $timeout,   # Total maximum runtime for probe
    $critical,  # Time after which to kill gLExec
    $warning,   # Time after which to warn about slow gLExec
    $payload,   # Payload plus arguments: relative uses $PATH to find
    $verbose,   # Verbosity level
);
323    
# Prints the full help text, including the current default values, and
# exits with status 0. This function does not return.
sub usage() {
    my $name = $0;
    $name =~ s{.*/}{};      # strip directory part of the program name
    print <<"END_OF_USAGE";
Usage: $name [options]

Options:
-t|--timeout <timeout> maximum runtime for probe, default: $deftimeout sec
-w|--warning <timeout> runtime after which to warn, default: $defwarning sec
-c|--critical <timeout> runtime after which to probe is to be killed,
default: $defcritical sec
-x|--x509-user-proxy <file> set X509_USER_PROXY to given file
-g|--glexec-client-cert <file> set GLEXEC_CLIENT_CERT to given file
default: value of variable X509_USER_PROXY
-e|--execute <cmd> command to execute by gLExec
default: \"$defpayload\"
-v|--verbose be more verbose, more -v means more verbosity
-V|--version print version
--help show this helptext
-h show short usage information
END_OF_USAGE
    exit 0;
}
347    
# Prints the one-line usage summary. Returns the result of print, which
# callers use as a true value to chain an exit.
sub shortusage() {
    my $name = $0;
    $name =~ s{.*/}{};      # strip directory part of the program name
    print "Usage: ${name} [options]\n";
}
355    
# Prints the program name and probe version on one line. Returns the result
# of print, which callers use as a true value to chain an exit.
sub version() {
    my $name = $0;
    $name =~ s{.*/}{};      # strip directory part of the program name
    print "${name} version: ${probeversion}\n";
}
363    
# Parses command line options and sets global variables
# - starts from the built-in defaults, then applies the options
# - invalid options: prints the short usage and exits with 3 (nagios UNKNOWN)
# - --help, -h and -V print their text and exit with 0
# - GLEXEC_CLIENT_CERT defaults to the value of X509_USER_PROXY
sub getopts() {
    my $version;
    my $help;
    my $shorthelp;
    # -H, -p and -u are accepted, with or without a value, so that the probe
    # can be called with these options; their values are not used.
    my ($host, $port, $url);

    $timeout=$deftimeout;
    $critical=$defcritical;
    $warning=$defwarning;
    $payload=$defpayload;
    my $parsed = GetOptions(
        "t|timeout=i" => \$timeout,
        "c|critical=i" => \$critical,
        "w|warning=i" => \$warning,
        "x|x509-user-proxy=s" => \$X509_USER_PROXY,
        "g|glexec-client-cert=s" => \$GLEXEC_CLIENT_CERT,
        "e|execute=s" => \$payload,
        "v|verbose+" => \$verbose,
        "help+" => \$help,
        "h+" => \$shorthelp,
        "V|version+" => \$version,
        "H|host:s" => \$host,
        "p|port:s" => \$port,
        "u|url:s" => \$url,
    );

    # usage() always exits with 0, which nagios reads as OK; a probe that
    # is called with invalid options must not look healthy, so print the
    # short usage here and exit with UNKNOWN.
    if (!$parsed) {
        shortusage();
        exit(3);
    }

    if ($help) {
        usage();            # exits with 0 itself
    }
    if ($shorthelp) {
        shortusage();
        exit(0);
    }
    if ($version) {
        version();
        exit(0);
    }
    if (!defined $GLEXEC_CLIENT_CERT) {
        $GLEXEC_CLIENT_CERT=$X509_USER_PROXY;
    }
}
399    
# Exit function: prints the nagios status line first, then dumps the
# collected log, and exits with the nagios code returned by get_status.
sub nagios_exit() {
    my $nagios_code = nagstat->get_status();    # prints the status line
    logger->get_log();                          # prints the log lines
    exit $nagios_code;
}
409    
# Finds gLExec in path and pre-specified directories
# Search order: $GLEXEC_LOCATION/sbin, the directories in @PATH,
# $GLITE_LOCATION/sbin and finally a fixed list of system directories.
# Returns the full path of the first executable file named glexec, or
# undef when none is found.
# Side effects: sets GLITE_LOCATION to /opt/glite when it is undefined and
# sets PATH to "." when it is empty.
sub find_glexec {
    my @DEFAULT_PATH=("/usr/local/sbin","/usr/sbin","/sbin",
                      "/usr/local/bin","/usr/bin");

    # Try GLEXEC_LOCATION
    if (defined $GLEXEC_LOCATION) {
        logger->log_func(3,"GLEXEC_LOCATION=".$GLEXEC_LOCATION."\n");
        my $candidate=$GLEXEC_LOCATION."/sbin/glexec";
        # -x alone is also true for a searchable directory
        if (-f $candidate && -x _) {
            logger->log_func(2,"gLExec found at ".$candidate."\n");
            return $candidate;
        }
        logger->log_func(2,"gLExec NOT found at \$GLEXEC_LOCATION\n");
    }

    # Try GLITE_LOCATION
    $GLITE_LOCATION="/opt/glite" if (!defined $GLITE_LOCATION);
    logger->log_func(3,"GLITE_LOCATION=".$GLITE_LOCATION."\n");

    # Fall back to the current directory only when PATH has no entries at
    # all; a PATH with a single directory is valid and must be kept.
    @PATH=(".") if (!@PATH);

    for my $dir (@PATH,$GLITE_LOCATION."/sbin",@DEFAULT_PATH) {
        logger->log_func(3,"Looking for glexec in ".$dir."\n");
        my $candidate=$dir."/glexec";
        if (-f $candidate && -x _) {
            logger->log_func(2,"gLExec found at ".$candidate."\n");
            return $candidate;
        }
    }
    return;
}
445    
# Translates the result of the gLExec run into a nagios status:
# glexec_to_nagios(exitcode, signal number, runtime)
# - terminated by a signal:        CRITICAL
# - exit code 0:                   OK, or WARNING when runtime >= $warning;
#                                  performance data is set in both cases
# - exit code 126:                 WARNING
# - exit codes 201, 202, 203, 204: CRITICAL
# - any other exit code:           CRITICAL
# A status that was set earlier (e.g. by a timeout) is kept, since
# nagstat->set_status only honours its first call.
# Returns 0 when gLExec exited with code 0 and 1 otherwise.
sub glexec_to_nagios($$$) {
    my ($rc, $signo, $dt) = @_;

    # A process that is terminated by a signal has exit code 0 in its wait
    # status, so the signal has to be checked before the exit code;
    # otherwise a killed gLExec would be reported as a success.
    if ($signo!=0) {
        nagstat->set_status(2,"exit due to signal $signo ($rc)");
        return 1;
    }

    if ($rc==0) {
        nagstat->set_perfdata("${dt}s;$warning;$critical;0");
        if ($dt>=$warning) {
            nagstat->set_status(1,"gLExec took long time to succeed");
        } else {
            nagstat->set_status(0,"Success");
        }
        return 0;
    }

    if ($rc==126) {
        nagstat->set_status(1,"executable $payload can't be executed ($rc)");
    } elsif ($rc==201) {
        nagstat->set_status(2,"client error ($rc)");
    } elsif ($rc==202) {
        nagstat->set_status(2,"system error ($rc)");
    } elsif ($rc==203) {
        nagstat->set_status(2,"authorization error ($rc)");
    } elsif ($rc==204) {
        nagstat->set_status(2,"exit code overlap error ($rc)");
    } else {
        nagstat->set_status(2,
            "executable $payload failed with non-zero exit code ($rc)");
    }
    return 1;
}
478    
# Find gLExec command, payload command (when relative), runs it and returns
# status
# Returns 1 when gLExec cannot be found, when a proxy file is unusable or
# when the command cannot be started; otherwise returns the value of
# glexec_to_nagios(). The nagios status is set in all cases.
sub run_glexec() {
    my $exitcode;
    my $signo;

    # Make sure to have starttime
    my $t1=time();

    # Set alarm before looking for gLExec to prevent NFS timeouts
    alarm($critical);

    # Find glexec command
    my $glexec=find_glexec();
    if (!defined $glexec) {
        nagstat->set_status(2,"glexec command not found");
        return 1;
    }

    # Check proxies
    if (!defined $X509_USER_PROXY) {
        nagstat->set_status(3,"\$X509_USER_PROXY is unset.");
        return 1;
    }
    if (! -e $X509_USER_PROXY || ! -s $X509_USER_PROXY) {
        nagstat->set_status(3,
            "\$X509_USER_PROXY does not point to a nonempty file.");
        return 1;
    }
    if (! -e $GLEXEC_CLIENT_CERT || ! -s $GLEXEC_CLIENT_CERT) {
        nagstat->set_status(3,
            "\$GLEXEC_CLIENT_CERT does not point to a nonempty file.");
        return 1;
    }

    # Find full path for payload if it's relative
    if ($payload !~ /^\//) {
        # Command name is everything up to the first space
        (my $name=$payload) =~ s/ .*//;
        for my $dir (@PATH) {
            logger->log_func(3,"Looking for ".$name." in ".$dir."\n");
            my $fullname=$dir."/".$name;
            # -x alone is also true for a searchable directory
            if (-f $fullname && -x _) {
                # \Q..\E: the name is literal text, not a pattern; names
                # such as "c++" would otherwise be invalid or mismatch
                $payload =~ s/^\Q$name\E/$fullname/;
                logger->log_func(2,"Payload set to ".$payload."\n");
                last;
            }
        }
    }

    # Run actual probe in child process
    if (probeipc->run_probe("$glexec $payload",\$exitcode,\$signo)!=0) {
        return 1;
    }

    # Probe exited: find exit status
    my $t2=time();
    return glexec_to_nagios($exitcode,$signo,$t2-$t1);
}
540    
# Main sequence. The order matters: options first, then the logger and the
# status holder, and only then the signal handlers that use both.

# Command line options fill in the global settings
getopts();

# Logger, with the verbosity from the command line as log level
logger->new($verbose);

# Nagios status holder
nagstat->new();

# Signal handling: leave via nagios_exit, SIGKILL after $timeout and
# SIGTERM after $critical
probeipc->new(\&nagios_exit, $timeout, $critical);

# The actual gLExec probe; its result is recorded as nagios status
run_glexec();

# Print nagios status, dump the log and exit
nagios_exit();
558    

Properties

Name Value
svn:executable *

grid.support@nikhef.nl
ViewVC Help
Powered by ViewVC 1.1.28