/[pdpsoft]/trunk/nagios/glexec/check_glexec
ViewVC logotype

Contents of /trunk/nagios/glexec/check_glexec

Parent Directory Parent Directory | Revision Log Revision Log


Revision 2456 - (show annotations) (download)
Mon Dec 5 16:45:13 2011 UTC (10 years, 5 months ago) by msalle
File size: 14629 byte(s)
Add first version of EES probe. Add unsupported options to glexec probe and
split short and long -h/--help

1 #!/usr/bin/perl
2 #
3 # Copyright (C) Nikhef 2011
4 #
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
8 #
9 # http://www.apache.org/licenses/LICENSE-2.0
10 #
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
16 #
17 # Author:
18 # Mischa Sall\'e <msalle@nikhef.nl>
19 # NIKHEF Amsterdam, the Netherlands
20 #
21 ########################################################################
22 #
23 # Nagios probe to test functioning of gLExec
24 #
25 # Nagios state can be one of the following:
26 # - Missing glexec command: CRITICAL
27 # - input proxies empty: UNKNOWN
28 # - short timeout exceeded: WARNING
29 # - timeout exceeded: CRITICAL
30 # - gLExec exit codes:
31 # 0 glexec succeeded: OK
32 # 201 Client error: CRITICAL
33 # 202 Internal error: CRITICAL
34 # 203 Auth error: CRITICAL
35 # 204 Overlap: CRITICAL
36 # 126 execve failed: WARNING
37 # 128+n signal: WARNING
38 # !=0 rc of payload: WARNING
39 #
40 ########################################################################
41
42 # DEFAULTS
43 my $probeversion=0.2;
44
45 # Note the following defaults can be overridden using cmdline options
46 my $deftimeout=10; # Overall timeout for probe
47 my $defcritical=8; # When to send SIGTERM
48 my $defwarning=5; # When to warn about slow running
49 my $defpayload="id -a"; # Which payload to run
50
51 ########################################################################
52 # Logging package
53 # keeps internal log trace which can be dumped with dump_log
54 ########################################################################
package logger;
use strict;
use warnings;
{
    # The logger keeps a single, class-wide state: all callers use it through
    # class-method calls (logger->log_func(...)), so the level and the buffered
    # lines live in lexicals shared by every method below.
    my $loglevel = 0;   # highest priority that is still recorded
    my @logstring;      # buffered log lines, in order of arrival

    # Constructor: logger->new([level])
    # Sets the log level to the given value, or to 0 when none is given.
    # Returns an (otherwise empty) object blessed into the invoking class.
    sub new {
        my $classname = shift;
        my $level = shift;
        my $self = bless {}, (ref($classname) || $classname);
        if (defined $level) {
            $self->set_loglevel($level);
        } else {
            $loglevel = 0;
        }
        return $self;
    }

    # Sets loglevel; an undefined level is treated as 0 so that later
    # comparisons in log_func never operate on undef.
    sub set_loglevel {
        my $self = shift;
        my $level = shift;
        $loglevel = defined $level ? $level : 0;
        return;
    }

    # Logging function: log_func(priority, "logstring\n", ...)
    # Buffers the given lines when priority does not exceed the log level.
    sub log_func {
        my $self = shift;
        my $prio = shift;
        return if ($prio > $loglevel);
        push @logstring, @_;
        return;
    }

    # Dumps log: prints every buffered line to the selected output handle.
    sub get_log {
        my $self = shift;
        for my $entry (@logstring) {
            print $entry;
        }
        return;
    }
}
100
101 ########################################################################
102 # Nagios status printing package
103 # Can set and dump nagios status output
104 ########################################################################
package nagstat;
use strict;
use warnings;
{
    # Class-wide state: the probe uses this package through class-method
    # calls (nagstat->set_status(...)), so there is exactly one status.
    my @stat = ("OK", "WARNING", "CRITICAL", "UNKNOWN");
    my $code = 3;       # Nagios state 0-3, default unknown
    my $summary;        # one-line summary; undef means "not set yet"
    my $perfdata;       # optional performance data string

    # Constructor: resets the status to UNKNOWN without summary or
    # performance data and returns an object blessed into the invoking class.
    sub new {
        my $classname = shift;
        my $self = bless {}, (ref($classname) || $classname);
        $code = 3; # Default status unknown
        $summary = undef;
        $perfdata = undef;
        return $self;
    }

    # Set nagios code (0-3) plus summary: set_status(code, "summary")
    # Only the first status set is kept; later calls are ignored as long as a
    # summary is present. A code outside 0-3 is stored as 3 (UNKNOWN) so that
    # get_status always has a label and a valid exit code.
    sub set_status {
        my $self = shift;
        return if (defined $summary);
        my $newcode = shift;
        if (!defined $newcode || $newcode !~ /\A[0-3]\z/) {
            $newcode = 3;
        }
        $code = $newcode;
        $summary = shift;
        return;
    }

    # Set internal performance data
    sub set_perfdata {
        my $self = shift;
        $perfdata = shift;
        return;
    }

    # Printout nagios status, summary and optionally performance data as
    # "STATE: summary[|perfdata]"; return value is code (0-3).
    # Note: when no summary was set, it is fixed to "unknown status" here,
    # which also blocks any later set_status call.
    sub get_status {
        if (!defined $summary) {
            $summary = "unknown status";
        }
        my $line = $stat[$code].": ".$summary;
        $line .= "|".$perfdata if (defined $perfdata);
        print $line."\n";
        return $code;
    }
}
152
153 ########################################################################
154 # Inter process communication package for nagios probes
155 # Starts alarm handler when receiving alarm which checks status of
156 # probe, and terminates or kills it.
157 ########################################################################
package probeipc;
use strict;
use warnings;
use POSIX ":sys_wait_h";
{
    # Class-wide state: the package is used through class-method calls and
    # through signal handlers, which cannot receive an object, so everything
    # is kept in lexicals shared by the subs below.
    my $pid;        # pid of the child running the probe, -1 before start
    my $wpid;       # last waitpid() result: 0 = not reaped, pid = reaped
    my $status;     # raw wait status ($?) saved when the child was reaped
    my $numsent;    # 1 once SIGTERM has been sent from the alarm handler
    my $killtime;   # time after which to send SIGKILL
    my $termtime;   # time after which to send SIGTERM
    my $exitfunc;   # code ref called to leave the probe

    # Constructor: new(exitfunc,[kill time], [term time])
    # Dies when no exitfunc is given. Kill time defaults to 10, term time to
    # the kill time. Installs the ALRM, INT and TERM signal handlers.
    sub new {
        my $classname = shift;
        my $self = bless {}, (ref($classname) || $classname);
        my $func = shift or die ($classname."::new() needs exitfunc arg\n");
        my $kill = (shift or 10); # probe default timeout is 10
        my $term = (shift or $kill);
        $self->set_exitfunc($func);
        $self->set_killtime($kill);
        $self->set_termtime($term);
        $pid = -1;
        $wpid = 0;
        $status = 0;
        $numsent = 0;
        $SIG{'ALRM'} = \&alarm_handler;
        $SIG{'INT'} = \&int_handler;
        $SIG{'TERM'} = \&int_handler;
        return $self;
    }

    # Sets time after which to send SIGKILL
    sub set_killtime {
        my $self = shift;
        $killtime = shift;
        return;
    }

    # Sets time after which to send SIGTERM
    sub set_termtime {
        my $self = shift;
        $termtime = shift;
        return;
    }

    # Sets function to call when exiting after sending a SIGKILL
    sub set_exitfunc {
        my $self = shift;
        $exitfunc = shift;
        return;
    }

    # Signal handler for SIGALRM
    # First alarm with a running child: send SIGTERM and re-arm the alarm for
    # the remaining (kill time - term time). Second alarm, or when kill time
    # does not exceed term time: send SIGKILL and leave via exitfunc.
    sub alarm_handler {
        if ($pid < 0) { # No pid, nothing to do
            logger->log_func(2,"Payload hasn't started yet\n");
            nagstat->set_status(2,"probe killtime exceeded");
            $exitfunc->();
        }
        # Either is or was a process: test status
        logger->log_func(2,"subprocess is/was running with pid ".$pid."\n");
        if ($wpid != 0) { # process has finished (>0) or waitpid(-1) failed
            return;
        }
        # Get status without blocking, keep it for wait_probe
        $wpid = waitpid($pid, WNOHANG);
        $status = $?;
        if ($wpid == 0) { # Still running
            if ($killtime <= $termtime || $numsent == 1) {
                logger->log_func(2,"Sending SIGKILL to ".$pid."\n");
                kill(9, $pid);
                nagstat->set_status(2,"probe timeout exceeded");
                $exitfunc->();
            }
            logger->log_func(2,"Sending SIGTERM to ".$pid."\n");
            kill(15, $pid);
            $numsent = 1;
            alarm($killtime - $termtime);
            nagstat->set_status(2,"probe critical time exceeded");
        }
        return;
    }

    # Signal handler for SIGINT and SIGTERM
    # Records the interruption, sends SIGTERM to a still running child and
    # leaves via exitfunc in all cases.
    sub int_handler {
        my ($sig) = @_;

        logger->log_func(2,"Caught SIG$sig\n");
        nagstat->set_status(2,"probe interrupted with SIG$sig");
        if ($pid < 0) { # No pid, nothing to do
            logger->log_func(2,"Payload hasn't started yet\n");
            $exitfunc->();
        }
        # Either is or was a process: test status
        if ($wpid != 0) { # process has finished (>0) or waitpid(-1) failed
            logger->log_func(2,"Subprocess with pid ".$pid." already finished\n");
            $exitfunc->();
        }
        # Get status
        $wpid = waitpid($pid, WNOHANG);
        $status = $?;
        if ($wpid == 0) { # Still running: send SIGTERM
            logger->log_func(2,"Sending SIGTERM to ".$pid."\n");
            kill(15, $pid);
        }
        $exitfunc->();
    }

    # Wait for specified pid and return exitcode and signal number as the
    # list (rc, signo). Both stay undef when the child could not be waited
    # for in any way.
    sub wait_probe {
        my $self = shift;
        my $rc;
        my $signo;

        if ($wpid <= 0) {
            # Not reaped by a signal handler yet: block until the child exits
            $wpid = waitpid($pid, 0);
            # Only overwrite the saved status when this waitpid really
            # collected the child; on failure keep what the handler stored
            $status = $? if ($wpid == $pid);
        }
        if ($wpid == $pid) { # probe was reaped, here or in the sighandler
            alarm(0);
            # Always decode the saved status: when the handler reaped the
            # child, $? is not guaranteed to still hold the child's status
            $rc = $status >> 8;
            $signo = $status & 127;
        } elsif ($wpid == -1) { # waitpid failed: fall back to saved status
            $rc = $status >> 8;
            $signo = $status & 127;
        }
        return ($rc, $signo);
    }

    # Starts specified command: run_probe("command",\$rc,\$signo) and returns 0
    # on normal exit, or 1 when command cannot be started.
    # Output of command (stdout and stderr) is stored as log info at level 3.
    # Nagios status is set when applicable.
    sub run_probe {
        my $self = shift;
        my $command = shift;
        my $rc = shift;
        my $signo = shift;

        # Start command through the shell, reading its merged output
        $pid = open(my $output, '-|', $command." 2>&1");
        if (!defined($pid)) {
            alarm(0);
            nagstat->set_status(2,"Failed to run $command");
            return 1;
        }
        while (my $line = <$output>) {
            logger->log_func(3, $line);
        }
        # The handle is deliberately not closed explicitly: close() on a pipe
        # would wait for the child itself, which wait_probe does here.
        ($$rc, $$signo) = $self->wait_probe();
        return 0;
    }
}
307
308 ########################################################################
309 # Running main probe package
310 ########################################################################
311 package main;
312 use strict;
313 use warnings;
314
315 use Env qw(@PATH GLEXEC_LOCATION GLITE_LOCATION X509_USER_PROXY GLEXEC_CLIENT_CERT);
316 use Getopt::Long qw(:config no_ignore_case bundling);
317
318 my $timeout; # Total maximum runtime for probe
319 my $critical; # Time after which to kill gLExec
320 my $warning; # Time after which to warn about slow gLExec
321 my $payload; # Payload plus arguments: relative uses $PATH to find
322 my $verbose; # Verbosity level
323
# Prints usage output and exits: usage([exitcode])
# The exit code defaults to 0 (help was requested); callers reporting a
# command line error pass a non-zero code so Nagios does not see state OK.
sub usage {
    my $rc = shift;
    (my $name = $0) =~ s/.*\///;
    print <<EOHELP;
Usage: $name [options]

Options:
 -t|--timeout <timeout>          maximum runtime for probe, default: $deftimeout sec
 -w|--warning <timeout>          runtime after which to warn, default: $defwarning sec
 -c|--critical <timeout>         runtime after which to probe is to be killed,
                                 default: $defcritical sec
 -x|--x509-user-proxy <file>     set X509_USER_PROXY to given file
 -g|--glexec-client-cert <file>  set GLEXEC_CLIENT_CERT to given file
                                 default: value of variable X509_USER_PROXY
 -e|--execute <cmd>              command to execute by gLExec
                                 default: \"$defpayload\"
 -v|--verbose                    be more verbose, more -v means more verbosity
 -V|--version                    print version
 --help                          show this helptext
 -h                              show short usage information
EOHELP
    exit(defined $rc ? $rc : 0);
}

# Prints short usage output (oneline); does not exit
sub shortusage {
    (my $name = $0) =~ s/.*\///;
    print <<EOHELP;
Usage: $name [options]
EOHELP
    return;
}

# Prints probe version; does not exit
sub version {
    (my $name = $0) =~ s/.*\///;
    print <<EOHELP;
$name version: $probeversion
EOHELP
    return;
}

# Parses command line options and sets global variables
# Timeouts and payload start from their defaults. Invalid options print the
# usage text and exit with 3 (Nagios UNKNOWN). --help, -h and -V print their
# text and exit with 0. GLEXEC_CLIENT_CERT falls back to X509_USER_PROXY.
sub getopts {
    my $version;
    my $help;
    my $shorthelp;
    # -H/--host and -p/--port are accepted but not supported: their values
    # are parsed into these sinks and never used.
    my $ignored_host;
    my $ignored_port;

    $timeout = $deftimeout;
    $critical = $defcritical;
    $warning = $defwarning;
    $payload = $defpayload;
    my $parsed_ok = GetOptions(
        "t|timeout=i" => \$timeout,
        "c|critical=i" => \$critical,
        "w|warning=i" => \$warning,
        "x|x509-user-proxy=s" => \$X509_USER_PROXY,
        "g|glexec-client-cert=s" => \$GLEXEC_CLIENT_CERT,
        "e|execute=s" => \$payload,
        "v|verbose+" => \$verbose,
        "help+" => \$help,
        "h+" => \$shorthelp,
        "V|version+" => \$version,
        "H|host" => \$ignored_host,
        "p|port" => \$ignored_port,
    );

    # A probe started with a broken command line must not report OK
    usage(3) if (!$parsed_ok);

    usage(0) if ($help);
    if ($shorthelp) {
        shortusage();
        exit(0);
    }
    if ($version) {
        version();
        exit(0);
    }
    if (!defined $GLEXEC_CLIENT_CERT) {
        $GLEXEC_CLIENT_CERT = $X509_USER_PROXY;
    }
    return;
}
398
# Leaves the probe: writes the Nagios status line, then the buffered log, and
# terminates the process with the Nagios state as exit code.
sub nagios_exit() {
    # get_status prints the status line and hands back the state (0-3)
    my $exitcode = nagstat->get_status();

    # Log output follows the status line
    logger->get_log();

    exit $exitcode;
}
408
# Finds gLExec in path and pre-specified directories
# Search order: $GLEXEC_LOCATION/sbin, every directory in PATH,
# $GLITE_LOCATION/sbin, then a fixed list of system directories.
# Returns the full path of the first executable "glexec" found, or undef.
# Side effects: GLITE_LOCATION is set to /opt/glite when unset and an empty
# PATH is set to "." (both are environment variables tied through Env).
sub find_glexec {
    my @default_path = ("/usr/local/sbin", "/usr/sbin", "/sbin",
                        "/usr/local/bin", "/usr/bin");

    # Try GLEXEC_LOCATION
    if (defined $GLEXEC_LOCATION) {
        logger->log_func(3,"GLEXEC_LOCATION=".$GLEXEC_LOCATION."\n");
        my $candidate = $GLEXEC_LOCATION."/sbin/glexec";
        if (-x $candidate) {
            logger->log_func(2,"gLExec found at ".$candidate."\n");
            return $candidate;
        }
        logger->log_func(2,"gLExec NOT found at \$GLEXEC_LOCATION\n");
    }

    # Try GLITE_LOCATION
    $GLITE_LOCATION = "/opt/glite" if (!defined $GLITE_LOCATION);
    logger->log_func(3,"GLITE_LOCATION=".$GLITE_LOCATION."\n");

    # Only fall back to "." when PATH has no entries at all; testing the
    # second element would also discard a PATH made of a single directory
    @PATH = (".") if (!@PATH);

    for my $dir (@PATH, $GLITE_LOCATION."/sbin", @default_path) {
        logger->log_func(3,"Looking for glexec in ".$dir."\n");
        my $candidate = $dir."/glexec";
        if (-x $candidate) {
            logger->log_func(2,"gLExec found at ".$candidate."\n");
            return $candidate;
        }
    }
    return;
}
444
# Translates the result of a gLExec run into a Nagios status:
# glexec_to_nagios(exitcode, signal number, runtime in seconds)
# Returns 0 when gLExec succeeded (in time or slowly) and 1 otherwise.
# Performance data is only set for a successful run.
# NOTE(review): the file header lists a failing payload as WARNING while the
# code reports CRITICAL; severities are kept as coded - confirm which is
# intended.
sub glexec_to_nagios {
    my ($rc, $signo, $dt) = @_;

    # A process killed by a signal has exit code 0 in its wait status, so
    # the signal must be tested before the exit code is trusted
    if ($signo != 0) {
        nagstat->set_status(2,"exit due to signal $signo ($rc)");
        return 1;
    }

    if ($rc == 0) {
        nagstat->set_perfdata("${dt}s;$warning;$critical;0");
        if ($dt >= $warning) {
            nagstat->set_status(1,"gLExec took long time to succeed");
        } else {
            nagstat->set_status(0,"Success");
        }
        return 0;
    }

    if ($rc == 126) {
        nagstat->set_status(1,"executable $payload can't be executed ($rc)");
    } elsif ($rc == 201) {
        nagstat->set_status(2,"client error ($rc)");
    } elsif ($rc == 202) {
        nagstat->set_status(2,"system error ($rc)");
    } elsif ($rc == 203) {
        nagstat->set_status(2,"authorization error ($rc)");
    } elsif ($rc == 204) {
        nagstat->set_status(2,"exit code overlap error ($rc)");
    } else {
        nagstat->set_status(2,
            "executable $payload failed with non-zero exit code ($rc)");
    }
    return 1;
}
477
# Find gLExec command, payload command (when relative), runs it and returns
# status: 0 when gLExec ran successfully, 1 on any failure. The Nagios status
# is set in all cases.
sub run_glexec {
    my $exitcode;
    my $signo;

    # Make sure to have starttime
    my $t1 = time();

    # Set alarm before looking for gLExec to prevent NFS timeouts
    alarm($critical);

    # Find glexec command
    my $glexec = find_glexec();
    if (!defined $glexec) {
        nagstat->set_status(2,"glexec command not found");
        return 1;
    }

    # Check proxies
    if (!defined $X509_USER_PROXY) {
        nagstat->set_status(3,"\$X509_USER_PROXY is unset.");
        return 1;
    }
    if (! -e $X509_USER_PROXY || ! -s $X509_USER_PROXY) {
        nagstat->set_status(3,
            "\$X509_USER_PROXY does not point to a nonempty file.");
        return 1;
    }
    if (! -e $GLEXEC_CLIENT_CERT || ! -s $GLEXEC_CLIENT_CERT) {
        nagstat->set_status(3,
            "\$GLEXEC_CLIENT_CERT does not point to a nonempty file.");
        return 1;
    }

    # Find full path for payload if it's relative
    if ($payload !~ m{^/}) {
        # Command name is everything up to the first space
        (my $name = $payload) =~ s/ .*//;
        for my $dir (@PATH) {
            logger->log_func(3,"Looking for ".$name." in ".$dir."\n");
            my $fullname = $dir."/".$name;
            # -x is also true for searchable directories: skip those
            if (-x $fullname && ! -d $fullname) {
                # Swap the leading name for the full path by position; the
                # name is not used as a pattern, so characters such as "+"
                # or "(" in it cannot break or misdirect the replacement
                $payload = $fullname.substr($payload, length($name));
                logger->log_func(2,"Payload set to ".$payload."\n");
                last;
            }
        }
    }

    # Run actual probe in child process
    if (probeipc->run_probe("$glexec $payload",\$exitcode,\$signo) != 0) {
        return 1;
    }

    # Probe exited: find exit status
    my $t2 = time();
    return glexec_to_nagios($exitcode, $signo, $t2 - $t1);
}
539
# Command line comes first: it fixes verbosity, timeouts and payload
getopts();

# Reset the log buffer and apply the requested verbosity
logger->new($verbose);

# Reset the Nagios status to its default
nagstat->new();

# Install the signal handlers: kill after $timeout, terminate after $critical
probeipc->new(\&nagios_exit, $timeout, $critical);

# Execute the gLExec probe itself; its outcome is kept in the Nagios status
run_glexec();

# Print Nagios status and log, then exit with the Nagios state
nagios_exit();
557

Properties

Name Value
svn:executable *

grid.support@nikhef.nl
ViewVC Help
Powered by ViewVC 1.1.28