/[pdpsoft]/tags/nagios_glexec_R_0_2/check_glexec
ViewVC logotype

Contents of /tags/nagios_glexec_R_0_2/check_glexec

Parent Directory Parent Directory | Revision Log Revision Log


Revision 2463 - (show annotations) (download)
Tue Dec 6 16:14:25 2011 UTC (10 years, 1 month ago) by msalle
File size: 14881 byte(s)
Nagios plugins for gLExec (vers. 0.2) and EES (vers 0.1)

1 #!/usr/bin/perl
2 #
3 # Copyright (C) Nikhef 2011
4 #
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
8 #
9 # http://www.apache.org/licenses/LICENSE-2.0
10 #
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
16 #
17 # Author:
18 # Mischa Sall\'e <msalle@nikhef.nl>
19 # NIKHEF Amsterdam, the Netherlands
20 #
21 ########################################################################
22 #
23 # Nagios probe to test functioning of gLExec
24 #
25 # Nagios state can be one of the following:
26 # - Missing glexec command: CRITICAL
27 # - input proxies empty: UNKNOWN
28 # - short timeout exceeded: WARNING
29 # - timeout exceeded: CRITICAL
30 # - gLExec exit codes:
31 # 0 glexec succeeded: OK
32 # 201 Client error: CRITICAL
33 # 202 Internal error: CRITICAL
34 # 203 Auth error: CRITICAL
35 # 204 Overlap: CRITICAL
36 # 126 execve failed: WARNING
37 # 128+n signal: WARNING
38 # !=0 rc of payload: WARNING
39 #
40 ########################################################################
41
# Version number printed by the -V|--version option
my $probeversion = 0.2;

# Built-in defaults; every one of them can be overridden on the command line
my ($deftimeout, $defcritical, $defwarning) = (
    10,     # seconds after which the probe gives up entirely (-t)
    8,      # seconds after which the payload is sent SIGTERM (-c)
    5,      # seconds after which a successful run counts as slow (-w)
);
my $defpayload = "id -a";   # command line handed to gLExec (-e)
50
51 ########################################################################
52 # Logging package
53 # keeps internal log trace which can be dumped with dump_log
54 ########################################################################
package logger;
use strict;
use warnings;
{
    # The logger is used through class-method calls (logger->log_func(...)),
    # so its state lives in lexicals shared by all methods instead of in the
    # blessed object.
    my $loglevel = 0;   # highest priority that is still recorded
    my @logstring;      # recorded log lines in order of arrival

    # Constructor: logger->new([level])
    # Sets the loglevel to the given level, or to 0 when none is given, and
    # returns an object blessed into the invoking class.
    sub new {
        my $classname = shift;
        my $level     = shift;
        # Two-argument bless keeps subclasses working
        my $self = bless {}, (ref($classname) || $classname || __PACKAGE__);
        $self->set_loglevel(defined $level ? $level : 0);
        return $self;
    }

    # Sets loglevel: set_loglevel(level)
    # An undefined level is stored as 0 so that log_func() never has to
    # compare against an undefined value.
    sub set_loglevel {
        my $self  = shift;
        my $level = shift;
        $loglevel = defined $level ? $level : 0;
        return $loglevel;
    }

    # Logging function: log_func(priority, "logstring\n", ...)
    # Stores all given strings when priority does not exceed the loglevel,
    # otherwise drops them.
    sub log_func {
        my $self = shift;
        my $prio = shift;
        return if ($prio > $loglevel);
        push @logstring, @_;
        return;
    }

    # Dumps log: prints every stored log string to the selected output handle
    sub get_log {
        my $self = shift;
        print $_ for @logstring;
        return;
    }
}
100
101 ########################################################################
102 # Nagios status printing package
103 # Can set and dump nagios status output
104 ########################################################################
package nagstat;
use strict;
use warnings;
{
    # State is kept in lexicals shared by all methods, because the package is
    # used through class-method calls (nagstat->set_status(...)).
    my @stat = ("OK", "WARNING", "CRITICAL", "UNKNOWN");   # label per code
    my $code = 3;       # Nagios status code, default status unknown
    my $summary;        # one-line summary, undefined until a status is set
    my $perfdata;       # optional performance data

    # Constructor: nagstat->new()
    # Resets the status to UNKNOWN without summary or performance data and
    # returns an object blessed into the invoking class.
    sub new {
        my $classname = shift;
        # Two-argument bless keeps subclasses working
        my $self = bless {}, (ref($classname) || $classname || __PACKAGE__);
        $code     = 3;
        $summary  = undef;
        $perfdata = undef;
        return $self;
    }

    # Set nagios code (0-3) plus summary: set_status(code, "summary")
    # Only the first status sticks: once a summary has been stored, later
    # calls are ignored.
    sub set_status {
        my ($self, $newcode, $newsummary) = @_;
        return if (defined $summary);
        $code    = $newcode;
        $summary = $newsummary;
        return;
    }

    # Set internal performance data: set_perfdata("perfdata")
    sub set_perfdata {
        my $self = shift;
        $perfdata = shift;
        return;
    }

    # Printout nagios status, summary and optionally performance data
    # return value is code (0-3)
    sub get_status {
        $summary = "unknown status" if (!defined $summary);
        # Anything that is not a valid Nagios code has no label in @stat and
        # is reported as UNKNOWN instead of printing an undefined value.
        $code = 3 if (!defined $code || $code !~ /\A[0-3]\z/);
        my $line = $stat[$code].": ".$summary;
        $line .= "|".$perfdata if (defined $perfdata);
        print $line."\n";
        return $code;
    }
}
152
153 ########################################################################
154 # Inter process communication package for nagios probes
155 # Starts alarm handler when receiving alarm which checks status of
156 # probe, and terminates or kills it.
157 ########################################################################
package probeipc;
use strict;
use warnings;
use POSIX ":sys_wait_h";
use Time::HiRes qw(alarm);
{
    # State is kept in lexicals shared by all methods and by the signal
    # handlers, which are plain functions and receive no object.
    my $pid     = -1;   # pid of the payload process, -1 while none is started
    my $wpid    = 0;    # last waitpid() result, 0 while payload is not reaped
    my $status  = 0;    # raw wait status ($?) stored after waitpid()
    my $numsent = 0;    # becomes 1 once SIGTERM has been sent
    my $killtime;       # time after which to send SIGKILL
    my $termtime;       # time after which to send SIGTERM
    my $exitfunc;       # code reference called to leave the probe

    # Constructor: new(exitfunc,[kill time], [term time])
    # Stores the arguments, resets the process bookkeeping and installs the
    # handlers for SIGALRM, SIGINT and SIGTERM. Dies without exitfunc.
    # A kill time of 0 or none defaults to 10, a term time of 0 or none
    # defaults to the kill time.
    sub new {
        my $classname = shift;
        my $class = ref($classname) || $classname || __PACKAGE__;
        # Two-argument bless keeps subclasses working
        my $self = bless {}, $class;
        my $func = shift or die ($class."::new() needs exitfunc arg\n");
        my $kill = (shift or 10);   # probe default timeout is 10
        my $term = (shift or $kill);
        $self->set_exitfunc($func);
        $self->set_killtime($kill);
        $self->set_termtime($term);
        $pid     = -1;
        $wpid    = 0;
        $status  = 0;
        $numsent = 0;
        $SIG{'ALRM'} = \&alarm_handler;
        $SIG{'INT'}  = \&int_handler;
        $SIG{'TERM'} = \&int_handler;
        return $self;
    }

    # Sets time after which to send SIGKILL
    sub set_killtime {
        my $self = shift;
        $killtime = shift;
        return;
    }

    # Sets time after which to send SIGTERM
    sub set_termtime {
        my $self = shift;
        $termtime = shift;
        return;
    }

    # Sets function to call when exiting after sending a SIGKILL
    sub set_exitfunc {
        my $self = shift;
        $exitfunc = shift;
        return;
    }

    # Signal handler for SIGALRM
    # First alarm: sends SIGTERM and re-arms the alarm for the remaining
    # time up to the kill time. When there is no such remaining time, or on
    # the second alarm, sends SIGKILL and calls the exit function.
    sub alarm_handler {
        if ($pid < 0) { # No pid, nothing to do
            logger->log_func(2,"Payload hasn't started yet\n");
            nagstat->set_status(2,"probe killtime exceeded");
            $exitfunc->();
            # Only reached when the exit function returns: never go on to
            # wait for or signal pid -1.
            return;
        }
        # Either is or was a process: test status
        logger->log_func(2,"subprocess is/was running with pid ".$pid."\n");
        # process has finished (>0) or waitpid(-1) failed
        return if ($wpid != 0);
        # Get status
        $wpid = waitpid($pid, WNOHANG);
        $status = $?;
        return if ($wpid != 0);
        # Still running
        if ($killtime <= $termtime || $numsent == 1) {
            logger->log_func(2,"Sending SIGKILL to ".$pid."\n");
            kill('KILL', $pid);
            nagstat->set_status(2,"probe timeout exceeded");
            $exitfunc->();
            return;
        }
        logger->log_func(2,"Sending SIGTERM to ".$pid."\n");
        kill('TERM', $pid);
        $numsent = 1;
        alarm($killtime-$termtime);
        nagstat->set_status(2,"probe critical time exceeded");
        return;
    }

    # Signal handler for SIGINT and SIGTERM
    # Sets a critical status, sends SIGTERM to a payload that is still
    # running and calls the exit function.
    sub int_handler {
        my ($sig) = @_;

        logger->log_func(2,"Caught SIG$sig\n");
        nagstat->set_status(2,"probe interrupted with SIG$sig");
        if ($pid < 0) { # No pid, nothing to do
            logger->log_func(2,"Payload hasn't started yet\n");
            $exitfunc->();
            return;
        }
        # Either is or was a process: test status
        if ($wpid != 0) { # process has finished (>0) or waitpid(-1) failed
            logger->log_func(2,"Subprocess with pid ".$pid." already finished\n");
            $exitfunc->();
            return;
        }
        # Get status
        $wpid = waitpid($pid, WNOHANG);
        $status = $?;
        if ($wpid == 0) { # Still running: send SIGTERM
            logger->log_func(2,"Sending SIGTERM to ".$pid."\n");
            kill('TERM', $pid);
        }
        $exitfunc->();
        return;
    }

    # Wait for specified pid and return exitcode and signal number.
    # Returns the list (exitcode, signal number); both are undefined when
    # no wait status is available.
    sub wait_probe {
        my $self = shift;

        if ($wpid <= 0) {
            $wpid = waitpid($pid, 0);
            # Keep the status next to the one the signal handlers store, so
            # that decoding never depends on a possibly stale $?
            $status = $? if ($wpid == $pid);
        }
        # probe exited: no further alarms needed
        alarm(0) if ($wpid == $pid);
        # $wpid==-1: probe was reaped in a signal handler, which saved $status
        return (undef, undef) unless ($wpid == $pid || $wpid == -1);
        return ($status >> 8, $status & 127);
    }

    # Starts specified command: run_probe("command",\$rc,\$signo) and returns 0
    # on normal exit, or 1 when command cannot be started.
    # Output of command (stdout and stderr) is stored as log info at
    # priority 3. Nagios status is set when applicable.
    sub run_probe {
        my ($self, $command, $rc, $signo) = @_;

        # Start command; the single command string plus redirection is still
        # run by the shell, as before.
        my $child = open(my $output, '-|', $command." 2>&1");
        if (!defined($child)) {
            alarm(0);
            $pid = -1;  # keep the handlers' numeric tests on $pid valid
            nagstat->set_status(2,"Failed to run $command");
            return 1;
        }
        $pid = $child;
        while (my $line = <$output>) {
            logger->log_func(3,$line);
        }
        # Do not close $output before this point: closing a piped handle
        # waits for the child itself and would take the exit status away
        # from wait_probe().
        ($$rc,$$signo) = $self->wait_probe();
        return 0;
    }
}
308
309 ########################################################################
310 # Running main probe package
311 ########################################################################
312 package main;
313 use strict;
314 use warnings;
315
316 use Env qw(@PATH GLEXEC_LOCATION GLITE_LOCATION X509_USER_PROXY GLEXEC_CLIENT_CERT);
317 use Getopt::Long qw(:config no_ignore_case bundling);
318 use Time::HiRes qw(time alarm);
319
# Run-time settings, all filled in by getopts()
my (
    $timeout,   # total maximum runtime for probe
    $critical,  # time after which to kill gLExec
    $warning,   # time after which to warn about slow gLExec
    $payload,   # payload plus arguments: relative uses $PATH to find
    $verbose,   # verbosity level
);
325
# Prints the full help text, including the current default values, to the
# selected output handle and exits the program with status 0. Never returns.
sub usage() {
    (my $name = $0) =~ s/.*\///;    # program name without directory part
    print <<EOHELP;
Usage: $name [options]

Options:
 -t|--timeout <timeout>          maximum runtime for probe, default: $deftimeout sec
 -w|--warning <timeout>          runtime after which to warn, default: $defwarning sec
 -c|--critical <timeout>         runtime after which the probe is to be killed,
                                 default: $defcritical sec
 -x|--x509-user-proxy <file>     set X509_USER_PROXY to given file
 -g|--glexec-client-cert <file>  set GLEXEC_CLIENT_CERT to given file
                                 default: value of variable X509_USER_PROXY
 -e|--execute <cmd>              command to execute by gLExec
                                 default: \"$defpayload\"
 -v|--verbose                    be more verbose, more -v means more verbosity
 -V|--version                    print version
 --help                          show this helptext
 -h                              show short usage information
EOHELP
    exit 0;
}
349
# Prints the one-line usage summary and returns the result of print, which
# callers use to decide whether to exit.
sub shortusage() {
    my $name = $0;
    $name =~ s{.*/}{};              # strip directory part of program name
    return print "Usage: ".$name." [options]\n";
}
357
# Prints program name and probe version on one line and returns the result
# of print, which callers use to decide whether to exit.
sub version() {
    my $name = $0;
    $name =~ s{.*/}{};              # strip directory part of program name
    return print $name." version: ".$probeversion."\n";
}
365
# Parses command line options and sets global variables
# Starts from the built-in defaults, lets the command line override them and
# normalises the three time values. Exits directly for --help, -h and
# -V|--version (status 0) and for invalid options (status 3).
sub getopts() {
    my $version;
    my $help;
    my $shorthelp;

    $timeout  = $deftimeout;
    $critical = $defcritical;
    $warning  = $defwarning;
    $payload  = $defpayload;
    my $parsed = GetOptions(
        "t|timeout=f"            => \$timeout,
        "c|critical=f"           => \$critical,
        "w|warning=f"            => \$warning,
        "x|x509-user-proxy=s"    => \$X509_USER_PROXY,
        "g|glexec-client-cert=s" => \$GLEXEC_CLIENT_CERT,
        "e|execute=s"            => \$payload,
        "v|verbose+"             => \$verbose,
        "help+"                  => \$help,
        "h+"                     => \$shorthelp,
        "V|version+"             => \$version,
        # Accepted as plain flags, their values are not used by this probe
        "H|host",
        "p|port",
        "u|url"
    );
    if (!$parsed) {
        # usage() always exits with 0, which Nagios reads as OK; an invalid
        # command line must end the probe with UNKNOWN (3) instead.
        shortusage();
        exit(3);
    }

    if ($help) {
        usage();        # exits with 0 itself
        exit(0);
    }
    if ($shorthelp) {
        shortusage();
        exit(0);
    }
    if ($version) {
        version();
        exit(0);
    }
    # Client certificate defaults to the user proxy
    if (!defined $GLEXEC_CLIENT_CERT) {
        $GLEXEC_CLIENT_CERT=$X509_USER_PROXY;
    }
    # Negative times make no sense
    $timeout  = 0 if ($timeout<0);
    $critical = 0 if ($critical<0);
    $warning  = 0 if ($warning<0);
    # SIGTERM cannot come later than the overall timeout
    $critical = $timeout if ($timeout<$critical);
    return;
}
405
# Leaves the probe: prints the nagios status line first, then the collected
# log trace, and exits with the nagios status code. Never returns.
sub nagios_exit() {
    # Status line goes out before the log
    my $exitcode = nagstat->get_status();

    # Dump whatever the logger has collected
    logger->get_log();

    exit $exitcode;
}
415
# Finds gLExec in path and pre-specified directories
# Search order: $GLEXEC_LOCATION/sbin, the directories in $PATH,
# $GLITE_LOCATION/sbin and a list of standard system directories.
# Side effects: sets $GLITE_LOCATION to /opt/glite when it is undefined and
# sets $PATH to "." when it is unset or empty.
# Returns the full path of the first executable glexec found, or undef.
sub find_glexec {
    my @DEFAULT_PATH=("/usr/local/sbin","/usr/sbin","/sbin",
                      "/usr/local/bin","/usr/bin");

    # Try GLEXEC_LOCATION
    if (defined $GLEXEC_LOCATION) {
        logger->log_func(3,"GLEXEC_LOCATION=".$GLEXEC_LOCATION."\n");
        my $candidate=$GLEXEC_LOCATION."/sbin/glexec";
        if (-x $candidate) {
            logger->log_func(2,"gLExec found at ".$candidate."\n");
            return $candidate;
        }
        logger->log_func(2,"gLExec NOT found at \$GLEXEC_LOCATION\n");
    }

    # Try GLITE_LOCATION
    $GLITE_LOCATION="/opt/glite" if (!defined $GLITE_LOCATION);
    logger->log_func(3,"GLITE_LOCATION=".$GLITE_LOCATION."\n");

    # Fall back to the current directory only when there is no PATH at all.
    # Looking at the second entry, as was done before, also threw away a
    # perfectly valid PATH that consists of a single directory.
    @PATH=(".") if (!defined $ENV{PATH} || $ENV{PATH} eq "");

    for my $dir (@PATH,$GLITE_LOCATION."/sbin",@DEFAULT_PATH) {
        logger->log_func(3,"Looking for glexec in ".$dir."\n");
        my $candidate=$dir."/glexec";
        if (-x $candidate) {
            logger->log_func(2,"gLExec found at ".$candidate."\n");
            return $candidate;
        }
    }
    return;
}
451
# Translates the outcome of a gLExec run into a nagios status:
# glexec_to_nagios(exitcode, signal number, runtime in seconds)
# Sets the nagios status, plus performance data on success.
# Returns 0 when gLExec succeeded (fast or slow), 1 otherwise.
# NOTE(review): the header of this file lists death by signal and a non-zero
# payload exit code as WARNING, the code has always reported them as
# CRITICAL; that is left unchanged here - confirm which one is intended.
sub glexec_to_nagios($$$) {
    my ($rc, $signo, $dt) = @_;

    # Without an exit code nothing can be said about the run
    if (!defined $rc) {
        nagstat->set_status(3,"exit status of gLExec could not be determined");
        return 1;
    }
    # Must be tested before the exit code: a process killed by a signal has
    # exit code 0 and would otherwise be reported as success.
    if (defined $signo && $signo!=0) {
        nagstat->set_status(2,"exit due to signal $signo ($rc)");
        return 1;
    }

    if ($rc==0) {
        nagstat->set_perfdata("${dt}s;$warning;$critical;0");
        if ($dt>=$warning) {
            nagstat->set_status(1,"gLExec took long time to succeed");
        } else {
            nagstat->set_status(0,"Success");
        }
        return 0;
    }

    if ($rc==126) {
        nagstat->set_status(1,"executable $payload can't be executed ($rc)");
    } elsif ($rc==201) {
        nagstat->set_status(2,"client error ($rc)");
    } elsif ($rc==202) {
        nagstat->set_status(2,"system error ($rc)");
    } elsif ($rc==203) {
        nagstat->set_status(2,"authorization error ($rc)");
    } elsif ($rc==204) {
        nagstat->set_status(2,"exit code overlap error ($rc)");
    } else {
        nagstat->set_status(2,
            "executable $payload failed with non-zero exit code ($rc)");
    }
    return 1;
}
484
# Find gLExec command, payload command (when relative), runs it and returns
# status
# Starts the alarm for the critical time, checks that the proxy files are
# usable and runs "<glexec> <payload>" through probeipc.
# Returns 1 when the probe could not be run (nagios status is set in that
# case), otherwise the return value of glexec_to_nagios().
sub run_glexec() {
    my $exitcode;
    my $signo;

    # Make sure to have starttime
    my $t1=time();

    # Set alarm before looking for gLExec to prevent NFS timeouts
    alarm($critical);

    # Find glexec command
    my $glexec=find_glexec();
    if (!defined $glexec) {
        nagstat->set_status(2,"glexec command not found");
        return 1;
    }

    # Check proxies: both must point to an existing, nonempty file
    # (-s is false for a missing file as well as for an empty one)
    if (!defined $X509_USER_PROXY) {
        nagstat->set_status(3,"\$X509_USER_PROXY is unset.");
        return 1;
    }
    if (! -s $X509_USER_PROXY) {
        nagstat->set_status(3,
            "\$X509_USER_PROXY does not point to a nonempty file.");
        return 1;
    }
    if (!defined $GLEXEC_CLIENT_CERT || ! -s $GLEXEC_CLIENT_CERT) {
        nagstat->set_status(3,
            "\$GLEXEC_CLIENT_CERT does not point to a nonempty file.");
        return 1;
    }

    # Find full path for payload if it's relative
    if ($payload !~ /^\//) {
        # Command name is everything up to the first space
        (my $name=$payload) =~ s/ .*//;
        # An empty name (payload starts with a space) would match every
        # directory in $PATH
        my @searchpath=($name ne "") ? @PATH : ();
        for my $dir (@searchpath) {
            logger->log_func(3,"Looking for ".$name." in ".$dir."\n");
            my $fullname=$dir."/".$name;
            # Directories are "executable" too, but never a payload
            if (-x $fullname && ! -d $fullname) {
                # \Q..\E: the command name is literal text, not a pattern
                $payload =~ s/^\Q$name\E/$fullname/;
                logger->log_func(2,"Payload set to ".$payload."\n");
                last;
            }
        }
    }

    # Run actual probe in child process
    if (probeipc->run_probe("$glexec $payload",\$exitcode,\$signo)!=0) {
        return 1;
    }

    # Probe exited: find exit status, runtime is rounded to milliseconds
    my $t2=time();
    my $dt=int(($t2-$t1)*1000+0.5)/1000;
    return glexec_to_nagios($exitcode,$signo,$dt);
}
547
# Command line first: it provides the verbosity and the timeouts used below
getopts();

# Set up the log trace with the requested verbosity
logger->new($verbose);

# Start with a clean (unknown) nagios status
nagstat->new();

# Install the signal handlers; nagios_exit is what they call to leave
probeipc->new(\&nagios_exit, $timeout, $critical);

# The probe itself: find and run gLExec, set the nagios status
run_glexec();

# Print nagios status and log trace, exit with the nagios code
nagios_exit();
565

Properties

Name Value
svn:executable *

grid.support@nikhef.nl
ViewVC Help
Powered by ViewVC 1.1.28