/[pdpsoft]/trunk/nagios/glexec/check_glexec.in
ViewVC logotype

Contents of /trunk/nagios/glexec/check_glexec.in

Parent Directory Parent Directory | Revision Log Revision Log


Revision 2590 - (show annotations) (download)
Tue Jan 15 16:37:50 2013 UTC (9 years, 8 months ago) by msalle
File size: 14949 byte(s)
Bugfix: in order to print the version, it needs to be a string, as it contains
multiple dots.

1 #!@PERL@
2 #
3 # Copyright (C) Nikhef 2011
4 #
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
8 #
9 # http://www.apache.org/licenses/LICENSE-2.0
10 #
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
16 #
17 # Author:
18 # Mischa Sall\'e <msalle@nikhef.nl>
19 # NIKHEF Amsterdam, the Netherlands
20 #
21 ########################################################################
22 #
23 # Nagios probe to test functioning of gLExec
24 #
25 # Nagios state can be one of the following:
26 # - Missing glexec command: CRITICAL
27 # - input proxies empty: UNKNOWN
28 # - short timeout exceeded: WARNING
29 # - timeout exceeded: CRITICAL
30 # - gLExec exit codes:
31 # 0 glexec succeeded: OK
32 # 201 Client error: CRITICAL
33 # 202 Internal error: CRITICAL
34 # 203 Auth error: CRITICAL
35 # 204 Overlap: CRITICAL
36 # 126 execve failed: WARNING
37 # 128+n signal: WARNING
38 # !=0 rc of payload: WARNING
39 #
40 ########################################################################
41
# DEFAULTS
# The version holds several dots and is therefore kept as a string.
my $probeversion="@VERSION@";

# Built-in defaults; every one of them can be overridden on the command line.
my ($deftimeout,    # overall timeout for the probe
    $defcritical,   # when to send SIGTERM
    $defwarning,    # when to warn about slow running
    $defpayload)    # which payload to run
    = (10, 8, 5, "id -a");
51
########################################################################
# Logging package
# Keeps an in-memory log trace which can be printed with get_log.
# The log level and the collected lines are shared by the whole package,
# so the methods work on the class name (logger->log_func(...)) as well
# as on an object returned by new.
########################################################################
package logger;
use strict;
use warnings;
{
    my $loglevel=0;     # highest priority that is still recorded
    my @logstring;      # recorded lines, in order of arrival

    # Constructor: new([loglevel])
    # The log level defaults to 0 when it is omitted or undefined.
    sub new {
        my $classname=shift;
        my $level=shift;
        # Two-argument bless: the object belongs to the invoking class
        my $self=bless {}, (ref($classname) || $classname);
        $self->set_loglevel($level);
        return $self;
    }

    # Sets loglevel: set_loglevel(level); an undefined level means 0
    sub set_loglevel {
        my $self=shift;
        my $level=shift;
        $loglevel=(defined $level ? $level : 0);
        return;
    }

    # Logging function: log_func(priority, "logstring\n", ...);
    # The strings are recorded only when priority does not exceed the
    # current log level.
    sub log_func {
        my $self=shift;
        my $prio=shift;
        return if ($prio > $loglevel);
        push @logstring,@_;
        return;
    }

    # Dumps log: prints every recorded string on stdout, oldest first
    sub get_log {
        my $self=shift;
        foreach my $myentry ( @logstring ) {
            print $myentry;
        }
        return;
    }
}
101
########################################################################
# Nagios status printing package
# Stores the nagios status code, summary and performance data and prints
# them as one status line.  The state is shared by the whole package, so
# the methods work on the class name (nagstat->set_status(...)) as well
# as on an object returned by new.
########################################################################
package nagstat;
{
    # State names indexed by nagios code
    my @stat=("OK","WARNING","CRITICAL","UNKNOWN");
    my $code=3;         # Default status unknown
    my $summary;        # undefined until the first set_status
    my $perfdata;       # optional performance data

    # Constructor: resets code, summary and performance data
    sub new {
        my $classname=shift;
        # Two-argument bless: the object belongs to the invoking class
        my $self=bless {}, (ref($classname) || $classname);
        $code=3;
        $summary=undef;
        $perfdata=undef;
        return $self;
    }

    # Set nagios code (0-3) plus summary: set_status(code, summary)
    # Only the first call that supplies a summary has effect; later calls
    # are ignored.  A code that is not 0, 1, 2 or 3 is stored as 3
    # (UNKNOWN), so get_status always has a state name and a valid exit
    # code.
    sub set_status {
        my $self=shift;
        return if (defined $summary);
        my $newcode=shift;
        $summary=shift;
        if (!defined $newcode || $newcode !~ /\A[0-3]\z/) {
            $newcode=3;
        }
        $code=$newcode;
        return;
    }

    # Set internal performance data: set_perfdata(string)
    sub set_perfdata {
        my $self=shift;
        $perfdata=shift;
        return;
    }

    # Prints "<STATE>: <summary>", followed by "|<perfdata>" when
    # performance data was set.  Return value is code (0-3).
    sub get_status {
        if (!defined $summary) {
            $summary="unknown status";
        }
        my $line=$stat[$code].": ".$summary;
        if (defined $perfdata) {
            $line.="|".$perfdata;
        }
        print $line."\n";
        return $code;
    }
}
153
########################################################################
# Inter process communication package for nagios probes
# Runs the probe command as a child process and installs signal handlers.
# On SIGALRM the child is checked and, when it is still running, it is
# terminated or killed.  On SIGINT/SIGTERM the child is terminated and
# the exit function is called.
# All state is shared by the whole package, and it is only reset by new.
########################################################################
package probeipc;
use POSIX ":sys_wait_h";
use Time::HiRes qw(alarm);
{
    my $pid=-1;     # pid of the child, -1 as long as it has not started
    my $wpid=0;     # last waitpid() result, 0 while the child is running
    my $status=0;   # wait status ($?) saved right after waitpid()
    my $numsent=0;  # 1 once the alarm handler has sent SIGTERM
    my $killtime;   # time after which SIGKILL is sent
    my $termtime;   # time after which SIGTERM is sent
    my $exitfunc;   # code reference called to leave the probe

    # Constructor: new(exitfunc,[kill time], [term time])
    # kill time defaults to 10, term time defaults to kill time.  Dies
    # when exitfunc is missing.  Installs the handlers for SIGALRM,
    # SIGINT and SIGTERM.
    sub new {
        my $classname=shift;
        # Two-argument bless: the object belongs to the invoking class
        my $self=bless {}, (ref($classname) || $classname);
        my $func=shift or die ($classname."::new() needs exitfunc arg\n");
        my $ktime=(shift or 10); # probe default timeout is 10
        my $ttime=(shift or $ktime);
        $self->set_exitfunc($func);
        $self->set_killtime($ktime);
        $self->set_termtime($ttime);
        $pid=-1;
        $wpid=0;
        $status=0;
        $numsent=0;
        $SIG{'ALRM'} = \&alarm_handler;
        $SIG{'INT'} = \&int_handler;
        $SIG{'TERM'} = \&int_handler;
        return $self;
    }

    # Sets time after which to send SIGKILL
    sub set_killtime {
        my $self=shift;
        $killtime=shift;
        return;
    }

    # Sets time after which to send SIGTERM
    sub set_termtime {
        my $self=shift;
        $termtime=shift;
        return;
    }

    # Sets function to call when exiting after sending a SIGKILL
    sub set_exitfunc {
        my $self=shift;
        $exitfunc=shift;
        return;
    }

    # Signal handler for SIGALRM
    # First alarm with killtime > termtime: send SIGTERM and re-arm the
    # alarm for the remaining killtime-termtime.  Otherwise, or on the
    # second alarm: send SIGKILL and call the exit function.
    sub alarm_handler {
        if ($pid<0) { # No pid, nothing to do
            logger->log_func(2,"Payload hasn't started yet\n");
            nagstat->set_status(2,"probe killtime exceeded");
            $exitfunc->();
        }
        # Either is or was a process: test status
        logger->log_func(2,"subprocess is/was running with pid ".$pid."\n");
        # process has finished (>0) or waitpid failed (-1)
        return if ($wpid!=0);
        # Get status, and save it: $? may be overwritten later on
        $wpid=waitpid($pid,WNOHANG);
        $status=$?;
        return if ($wpid!=0);
        # Still running
        if ($killtime<=$termtime || $numsent==1) {
            logger->log_func(2,"Sending SIGKILL to ".$pid."\n");
            kill(9,$pid);
            nagstat->set_status(2,"probe timeout exceeded");
            $exitfunc->();
        }
        logger->log_func(2,"Sending SIGTERM to ".$pid."\n");
        kill(15,$pid);
        $numsent=1;
        alarm($killtime-$termtime);
        nagstat->set_status(2,"probe critical time exceeded");
        return;
    }

    # Signal handler for SIGINT and SIGTERM
    # Sets a critical status, sends SIGTERM to a child that is still
    # running and calls the exit function.
    sub int_handler {
        my ($sig)=@_;

        logger->log_func(2,"Caught SIG$sig\n");
        nagstat->set_status(2,"probe interrupted with SIG$sig");
        if ($pid<0) { # No pid, nothing to do
            logger->log_func(2,"Payload hasn't started yet\n");
            $exitfunc->();
        }
        # Either is or was a process: test status
        if ($wpid!=0) { # process has finished (>0) or waitpid failed (-1)
            logger->log_func(2,"Subprocess with pid ".$pid." already finished\n");
            $exitfunc->();
        }
        # Get status
        $wpid=waitpid($pid,WNOHANG);
        $status=$?;
        if ($wpid==0) { # Still running: send SIGTERM
            logger->log_func(2,"Sending SIGTERM to ".$pid."\n");
            kill(15,$pid);
        }
        $exitfunc->();
    }

    # Wait for the child and return (exitcode, signal number).
    # The values are always taken from the wait status saved directly
    # after the waitpid() call that reaped the child, whether that call
    # was made here or in a signal handler.  When the child cannot be
    # waited for, the nagios status is set to UNKNOWN and (-1,0) is
    # returned, so the failure is never mistaken for exit code 0.
    sub wait_probe {
        my $self=shift;

        if ($wpid<=0) { # not reaped yet: block until the child exits
            $wpid=waitpid($pid,0);
            $status=$?;
        }
        if ($wpid!=$pid) { # waitpid failed, here or in a signal handler
            alarm(0);
            nagstat->set_status(3,"cannot obtain exit status of probe");
            return (-1,0);
        }
        alarm(0);
        return ($status >> 8,$status & 127);
    }

    # Starts specified command: run_probe("command",\$rc,\$signo) and returns 0
    # on normal exit, or 1 when command cannot be started.
    # stderr of the command is merged into its stdout, and every output
    # line is stored as log info with priority 3.  Nagios status is set
    # when applicable.
    sub run_probe {
        my $self=shift;
        my $command=shift;
        my $rc=shift;
        my $signo=shift;

        # Start command; the single command string still goes through the
        # shell, which is needed for the 2>&1 redirection
        $pid=open(my $output,'-|',$command." 2>&1");
        if (!defined($pid)) {
            $pid=-1; # keep the "not started" marker for the signal handlers
            alarm(0);
            nagstat->set_status(2,"Failed to run $command");
            return 1;
        }
        while (my $line=<$output>) {
            logger->log_func(3,$line);
        }
        ($$rc,$$signo)=$self->wait_probe();
        return 0;
    }
}
309
310 ########################################################################
311 # Running main probe package
312 ########################################################################
313 package main;
314 use strict;
315 use warnings;
316
317 use Env qw(@PATH GLEXEC_LOCATION GLITE_LOCATION X509_USER_PROXY GLEXEC_CLIENT_CERT);
318 use Getopt::Long qw(:config no_ignore_case bundling);
319 use Time::HiRes qw(time alarm);
320
# Settings of this run; getopts() fills them from the defaults and the
# command line.
my ($timeout,   # Total maximum runtime for probe
    $critical,  # Time after which to kill gLExec
    $warning,   # Time after which to warn about slow gLExec
    $payload,   # Payload plus arguments: relative uses $PATH to find
    $verbose);  # Verbosity level
326
# Prints the full help text, including the built-in defaults, on stdout
# and terminates the program with exit code 0.
sub usage() {
    # Program name without leading directories
    my $name = $0;
    $name =~ s/.*\///;
    my $helptext = <<EOHELP;
Usage: $name [options]

Options:
-t|--timeout <timeout> maximum runtime for probe, default: $deftimeout sec
-w|--warning <timeout> runtime after which to warn, default: $defwarning sec
-c|--critical <timeout> runtime after which the probe is to be killed,
default: $defcritical sec
-x|--x509-user-proxy <file> set X509_USER_PROXY to given file
-g|--glexec-client-cert <file> set GLEXEC_CLIENT_CERT to given file
default: value of variable X509_USER_PROXY
-e|--execute <cmd> command to be executed by gLExec
default: \"$defpayload\"
-v|--verbose be more verbose, more -v means more verbosity
-V|--version print version
--help show this helptext
-h show short usage information
EOHELP
    print $helptext;
    exit 0;
}
350
# Prints a one-line usage summary on stdout.
# Returns the result of print, which callers use as a truth value.
sub shortusage() {
    # Program name without leading directories
    my $name = $0;
    $name =~ s/.*\///;
    print "Usage: $name [options]\n";
}
358
# Prints the program name and the probe version on stdout.
# Returns the result of print, which callers use as a truth value.
sub version() {
    # Program name without leading directories
    my $name = $0;
    $name =~ s/.*\///;
    print "$name version: $probeversion\n";
}
366
# Parses command line options and sets global variables
# Starts from the built-in defaults, then applies the command line.
# --help, -h and -V print their text and exit with 0.  Invalid options
# print the short usage and exit with 3 (nagios UNKNOWN).
# Negative times are raised to 0, and the critical time is lowered to the
# timeout when it exceeds it.
sub getopts() {
    my $version;
    my $help;
    my $shorthelp;

    $timeout=$deftimeout;
    $critical=$defcritical;
    $warning=$defwarning;
    $payload=$defpayload;
    my $parsed=GetOptions(
        "t|timeout=f" => \$timeout,
        "c|critical=f" => \$critical,
        "w|warning=f" => \$warning,
        "x|x509-user-proxy=s" => \$X509_USER_PROXY,
        "g|glexec-client-cert=s" => \$GLEXEC_CLIENT_CERT,
        "e|execute=s" => \$payload,
        "v|verbose+" => \$verbose,
        "help+" => \$help,
        "h+" => \$shorthelp,
        "V|version+" => \$version,
        # NOTE(review): accepted as plain flags and not used anywhere in
        # this probe; presumably only there so that callers passing them
        # do not trigger an error - confirm
        "H|host",
        "p|port",
        "u|url"
    );
    if (!$parsed) {
        # usage() cannot be used here: it terminates with exit code 0,
        # which nagios would take for OK.  GetOptions has already
        # reported the offending option on stderr.
        shortusage();
        exit(3);
    }

    usage() if ($help);         # usage() does not return
    if ($shorthelp) {
        shortusage();
        exit(0);
    }
    if ($version) {
        version();
        exit(0);
    }
    if (!defined $GLEXEC_CLIENT_CERT) {
        $GLEXEC_CLIENT_CERT=$X509_USER_PROXY;
    }
    $timeout=0 if ($timeout<0);
    $critical=0 if ($critical<0);
    $warning=0 if ($warning<0);
    $critical=$timeout if ($timeout<$critical);
    return;
}
406
# Exit function: prints the nagios status line, then dumps the collected
# log, and exits with the nagios status code
sub nagios_exit() {
    # Status line goes first, the log trace follows it
    my $exitcode=nagstat->get_status();
    logger->get_log();
    exit($exitcode);
}
416
# Finds gLExec in path and pre-specified directories
# Search order: $GLEXEC_LOCATION/sbin (when set), every directory in
# $PATH, $GLITE_LOCATION/sbin and finally a list of system directories.
# $GLITE_LOCATION is set to /opt/glite when it is not defined.
# Returns the path of the first executable found, or nothing (undef in
# scalar context) when gLExec is not found.
sub find_glexec {
    my @DEFAULT_PATH=("/usr/local/sbin","/usr/sbin","/sbin",
                      "/usr/local/bin","/usr/bin");

    # Try GLEXEC_LOCATION
    if (defined $GLEXEC_LOCATION) {
        logger->log_func(3,"GLEXEC_LOCATION=".$GLEXEC_LOCATION."\n");
        my $glexloc=$GLEXEC_LOCATION."/sbin/glexec";
        if (-x $glexloc) {
            logger->log_func(2,"gLExec found at ".$glexloc."\n");
            return $glexloc;
        }
        logger->log_func(2,"gLExec NOT found at \$GLEXEC_LOCATION\n");
    }

    # Try GLITE_LOCATION
    $GLITE_LOCATION="/opt/glite" if (!defined $GLITE_LOCATION);
    logger->log_func(3,"GLITE_LOCATION=".$GLITE_LOCATION."\n");

    # Fall back to "." only when PATH is really unset or empty.  Testing
    # the second entry, as was done before, also overwrote a PATH that
    # holds exactly one directory.
    if (!defined $ENV{'PATH'} || $ENV{'PATH'} eq "") {
        @PATH=(".");
    }

    for my $dir (@PATH,$GLITE_LOCATION."/sbin",@DEFAULT_PATH) {
        logger->log_func(3,"Looking for glexec in ".$dir."\n");
        my $glexloc=$dir."/glexec";
        if (-x $glexloc) {
            logger->log_func(2,"gLExec found at ".$glexloc."\n");
            return $glexloc;
        }
    }
    return;
}
452
# Translates the result of a gLExec run into a nagios status:
# glexec_to_nagios(exitcode, signal number, elapsed time)
#   - exit code undefined:        UNKNOWN
#   - terminated by a signal:     CRITICAL
#   - exit code 0:                OK, or WARNING when the elapsed time is
#                                 at least $warning; performance data is
#                                 set in both cases
#   - exit code 126:              WARNING
#   - exit code 201-204:          CRITICAL, with the gLExec error name
#   - any other exit code:        CRITICAL
# The signal is tested before the exit code: a process that was
# terminated by a signal has exit code 0 and must not count as success.
# Returns 0 for the WARNING-on-slow-success case and 1 in all other cases.
sub glexec_to_nagios($$$) {
    my $rc=shift;
    my $signo=shift;
    my $dt=shift;

    # Texts for the exit codes that gLExec itself produces
    my %glexec_error=(
        201 => "client error",
        202 => "system error",
        203 => "authorization error",
        204 => "exit code overlap error",
    );

    if (!defined $rc) {
        nagstat->set_status(3,"exit status of gLExec could not be determined");
        return 1;
    }
    if (defined $signo && $signo!=0) {
        nagstat->set_status(2,"exit due to signal $signo ($rc)");
        return 1;
    }
    if ($rc==0) {
        nagstat->set_perfdata("${dt}s;$warning;$critical;0");
        if ($dt>=$warning) {
            nagstat->set_status(1,"gLExec took long time to succeed");
            return 0;
        }
        nagstat->set_status(0,"Success");
        return 1;
    }
    if ($rc==126) {
        nagstat->set_status(1,"executable $payload can't be executed ($rc)");
    } elsif (exists $glexec_error{$rc}) {
        nagstat->set_status(2,$glexec_error{$rc}." ($rc)");
    } else {
        nagstat->set_status(2,
            "executable $payload failed with non-zero exit code ($rc)");
    }
    return 1;
}
485
# Find gLExec command, payload command (when relative), runs it and returns
# status
# Sets the nagios status itself in every case.  Returns 1 when gLExec or
# a usable proxy is missing, or when the command cannot be started;
# otherwise returns the value of glexec_to_nagios().
sub run_glexec() {
    my $exitcode;
    my $signo;

    # Make sure to have starttime
    my $t1=time();

    # Set alarm before looking for gLExec to prevent NFS timeouts
    alarm($critical);

    # Find glexec command
    my $glexec=find_glexec();
    if (!defined $glexec) {
        nagstat->set_status(2,"glexec command not found");
        return 1;
    }

    # Check proxies
    if (!defined $X509_USER_PROXY) {
        nagstat->set_status(3,"\$X509_USER_PROXY is unset.");
        return 1;
    }
    if (! -e $X509_USER_PROXY || ! -s $X509_USER_PROXY) {
        nagstat->set_status(3,
            "\$X509_USER_PROXY does not point to a nonempty file.");
        return 1;
    }
    if (! -e $GLEXEC_CLIENT_CERT || ! -s $GLEXEC_CLIENT_CERT) {
        nagstat->set_status(3,
            "\$GLEXEC_CLIENT_CERT does not point to a nonempty file.");
        return 1;
    }

    # Find full path for payload if it's relative
    if ($payload !~ /^\/.*/) {
        # Command name is everything up to the first space
        (my $name=$payload) =~ s/ .*//;
        for my $dir (@PATH) {
            logger->log_func(3,"Looking for ".$name." in ".$dir."\n");
            my $fullname=$dir."/".$name;
            if (-x $fullname) {
                # \Q..\E: the name is literal text, not a pattern, so
                # names like "c++filt" are replaced correctly
                $payload =~ s/^\Q$name\E/$fullname/;
                logger->log_func(2,"Payload set to ".$payload."\n");
                last;
            }
        }
    }

    # Run actual probe in child process
    if (probeipc->run_probe("$glexec $payload",\$exitcode,\$signo)!=0) {
        return 1;
    }

    # Probe exited: find exit status; elapsed time is rounded to 0.001
    my $t2=time();
    my $dt=int(($t2-$t1)*1000+0.5)/1000;
    return glexec_to_nagios($exitcode,$signo,$dt);
}
548
# Main program.  The order matters: the options are needed by everything
# else, and the signal handlers call the logger and the nagios status
# package.

# Parse commandline options (exits by itself for help and version output)
getopts();

# Initialize logger and set loglevel
logger->new($verbose);

# Initialize objects
nagstat->new();

# Initialize signal handling: $timeout is the time after which SIGKILL is
# sent, $critical the time after which SIGTERM is sent
probeipc->new(\&nagios_exit,$timeout,$critical);

# run actual gLExec probe; its return value is not used, the result is
# kept in the nagios status
run_glexec();

# Dump nagios status, log and exit
nagios_exit();
566

Properties

Name Value
svn:executable *

grid.support@nikhef.nl
ViewVC Help
Powered by ViewVC 1.1.28