/[pdpsoft]/trunk/nagios/glexec/check_glexec
ViewVC logotype

Contents of /trunk/nagios/glexec/check_glexec

Parent Directory Parent Directory | Revision Log Revision Log


Revision 2457 - (show annotations) (download)
Tue Dec 6 10:19:13 2011 UTC (10 years, 5 months ago) by msalle
File size: 14639 byte(s)
Add (unused) -u/--url option to glexec probe, add EES probe.

1 #!/usr/bin/perl
2 #
3 # Copyright (C) Nikhef 2011
4 #
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
8 #
9 # http://www.apache.org/licenses/LICENSE-2.0
10 #
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
16 #
17 # Author:
18 # Mischa Sall\'e <msalle@nikhef.nl>
19 # NIKHEF Amsterdam, the Netherlands
20 #
21 ########################################################################
22 #
23 # Nagios probe to test functioning of gLExec
24 #
25 # Nagios state can be one of the following:
26 # - Missing glexec command: CRITICAL
27 # - input proxies empty: UNKNOWN
28 # - short timeout exceeded: WARNING
29 # - timeout exceeded: CRITICAL
30 # - gLExec exit codes:
31 # 0 glexec succeeded: OK
32 # 201 Client error: CRITICAL
33 # 202 Internal error: CRITICAL
34 # 203 Auth error: CRITICAL
35 # 204 Overlap: CRITICAL
36 # 126 execve failed: WARNING
37 # 128+n signal: CRITICAL
38 # !=0 rc of payload: CRITICAL
39 #
40 ########################################################################
41
42 # DEFAULTS
my $probeversion = 0.2;         # Version printed by -V|--version

# Built-in defaults; each of them can be overridden using cmdline options
my $deftimeout   = 10;          # Overall timeout for probe (sec)
my $defcritical  = 8;           # When to send SIGTERM (sec)
my $defwarning   = 5;           # When to warn about slow running (sec)
my $defpayload   = "id -a";     # Which payload to run
50
51 ########################################################################
52 # Logging package
53 # keeps internal log trace which can be dumped with dump_log
54 ########################################################################
package logger;
use strict;
use warnings;
{
    # The log state lives in lexicals shared by all methods, so the logger
    # acts as a singleton: it can be driven through an object or directly
    # through the class name (logger->log_func(...)).
    my $loglevel = 0;   # highest priority that is still recorded
    my @logstring;      # recorded entries, in the order they were logged

    # Constructor: logger->new([level])
    # Sets the loglevel to level, or to 0 when level is undefined.
    # Returns the new object, blessed into the class it was called on.
    sub new {
        my ($classname, $level) = @_;
        my $self = bless {}, $classname;
        $self->set_loglevel(defined $level ? $level : 0);
        return $self;
    }

    # Sets loglevel: set_loglevel(level)
    # Returns the new loglevel.
    sub set_loglevel {
        my ($self, $level) = @_;
        $loglevel = $level;
        return $loglevel;
    }

    # Logging function: log_func(priority, "logstring\n", ...);
    # The strings are recorded only when priority does not exceed the
    # current loglevel; otherwise they are dropped.
    sub log_func {
        my ($self, $prio, @lines) = @_;
        return if ($prio > $loglevel);
        push @logstring, @lines;
        return;
    }

    # Dumps log: prints all recorded entries on stdout, oldest first
    sub get_log {
        my $self = shift;
        for my $entry (@logstring) {
            print $entry;
        }
        return;
    }
}
100
101 ########################################################################
102 # Nagios status printing package
103 # Can set and dump nagios status output
104 ########################################################################
package nagstat;
{
    # State is kept in lexicals shared by every method, so the status can be
    # updated through the class name (nagstat->set_status(...)) from anywhere.
    my $code;       # numeric nagios state, used as index into @stat
    my $summary;    # status text; undef until the first set_status()
    my $perfdata;   # optional performance data string
    my @stat;       # state names indexed by $code

    # Constructor: resets the state to UNKNOWN (3) without summary or
    # performance data
    sub new() {
        my $classname = shift;
        ($code, $summary, $perfdata) = (3, undef, undef);
        @stat = ("OK", "WARNING", "CRITICAL", "UNKNOWN");
        my $self = {};
        return bless $self;
    }

    # Set nagios code (0-3) plus summary: set_status(code, "summary")
    # Only the first call with a defined summary has an effect; later calls
    # are ignored.
    sub set_status($$) {
        my $self = shift;
        return if (defined $summary);
        ($code, $summary) = @_;
    }

    # Set internal performance data: set_perfdata("perfdata")
    sub set_perfdata($) {
        my ($self, $data) = @_;
        $perfdata = $data;
    }

    # Printout nagios status, summary and optionally performance data
    # (separated from the summary by a '|').
    # return value is code (0-3)
    sub get_status {
        $summary = "unknown status" unless (defined $summary);
        my $line = $stat[$code].": ".$summary;
        $line .= "|".$perfdata if (defined $perfdata);
        print $line."\n";
        return $code;
    }
}
152
153 ########################################################################
154 # Inter process communication package for nagios probes
155 # Starts alarm handler when receiving alarm which checks status of
156 # probe, and terminates or kills it.
157 ########################################################################
package probeipc;
use POSIX ":sys_wait_h";
{
    # All state is kept in lexicals shared by the methods and the signal
    # handlers, so the package acts as a singleton.
    my $pid;        # pid of the probe command; -1 until run_probe() starts it
    my $wpid;       # last waitpid() result for $pid; 0 means not reaped yet
    my $status;     # raw wait status ($?) saved right after a waitpid()
    my $numsent;    # 1 once the alarm handler has sent SIGTERM
    my $killtime;   # time after which to send SIGKILL
    my $termtime;   # time after which to send SIGTERM
    my $exitfunc;   # code reference called to leave the probe

    # Constructor: new(exitfunc,[kill time], [term time])
    # Dies when exitfunc is missing. kill time defaults to 10, term time to
    # kill time. Resets the process state and installs the handlers for
    # SIGALRM, SIGINT and SIGTERM.
    sub new {
        my $classname = shift;
        my $self = bless {}, $classname;
        my $func = shift or die($classname."::new() needs exitfunc arg\n");
        my $kill = (shift or 10);   # probe default timeout is 10
        my $term = (shift or $kill);
        $self->set_exitfunc($func);
        $self->set_killtime($kill);
        $self->set_termtime($term);
        $pid     = -1;
        $wpid    = 0;
        $status  = 0;
        $numsent = 0;
        $SIG{'ALRM'} = \&alarm_handler;
        $SIG{'INT'}  = \&int_handler;
        $SIG{'TERM'} = \&int_handler;
        return $self;
    }

    # Sets time after which to send SIGKILL
    sub set_killtime {
        my ($self, $time) = @_;
        $killtime = $time;
        return $killtime;
    }

    # Sets time after which to send SIGTERM
    sub set_termtime {
        my ($self, $time) = @_;
        $termtime = $time;
        return $termtime;
    }

    # Sets function to call when exiting after sending a SIGKILL
    sub set_exitfunc {
        my ($self, $func) = @_;
        $exitfunc = $func;
        return $exitfunc;
    }

    # Signal handler for SIGALRM
    # First alarm with a still running process: send SIGTERM and re-arm the
    # alarm for the remaining killtime-termtime. Second alarm, or when no
    # separate term time is configured: send SIGKILL and call exitfunc.
    sub alarm_handler {
        if ($pid < 0) { # No pid, nothing to do
            logger->log_func(2, "Payload hasn't started yet\n");
            nagstat->set_status(2, "probe killtime exceeded");
            $exitfunc->();
        }
        # Either is or was a process: test status
        logger->log_func(2, "subprocess is/was running with pid ".$pid."\n");
        # process has finished (>0) or waitpid(-1) failed
        return if ($wpid != 0);
        # Get status
        $wpid   = waitpid($pid, WNOHANG);
        $status = $?;
        if ($wpid == 0) { # Still running
            if ($killtime <= $termtime || $numsent == 1) {
                logger->log_func(2, "Sending SIGKILL to ".$pid."\n");
                kill(9, $pid);
                nagstat->set_status(2, "probe timeout exceeded");
                $exitfunc->();
            }
            logger->log_func(2, "Sending SIGTERM to ".$pid."\n");
            kill(15, $pid);
            $numsent = 1;
            alarm($killtime - $termtime);
            nagstat->set_status(2, "probe critical time exceeded");
        }
        return;
    }

    # Signal handler for SIGINT and SIGTERM
    # Sets a CRITICAL status, sends SIGTERM to a still running process and
    # calls exitfunc.
    sub int_handler {
        my ($sig) = @_;

        logger->log_func(2, "Caught SIG$sig\n");
        nagstat->set_status(2, "probe interrupted with SIG$sig");
        if ($pid < 0) { # No pid, nothing to do
            logger->log_func(2, "Payload hasn't started yet\n");
            $exitfunc->();
        }
        # Either is or was a process: test status
        if ($wpid != 0) { # process has finished (>0) or waitpid(-1) failed
            logger->log_func(2,
                "Subprocess with pid ".$pid." already finished\n");
            $exitfunc->();
        }
        # Get status
        $wpid   = waitpid($pid, WNOHANG);
        $status = $?;
        if ($wpid == 0) { # Still running: send SIGTERM
            logger->log_func(2, "Sending SIGTERM to ".$pid."\n");
            kill(15, $pid);
        }
        $exitfunc->();
    }

    # Wait for specified pid and return exitcode and signal number.
    # The values are always decoded from the saved wait status, so a child
    # that was already reaped in a signal handler is reported from the status
    # saved there instead of from whatever $? holds by now.
    sub wait_probe {
        my $self = shift;

        if ($wpid <= 0) {
            $wpid = waitpid($pid, 0);
            # When waitpid() fails the previously saved status is kept
            $status = $? if ($wpid == $pid);
        }
        alarm(0) if ($wpid == $pid);    # probe has exited: cancel timeout
        return ($status >> 8, $status & 127);
    }

    # Starts specified command: run_probe("command",\$rc,\$signo) and returns 0
    # on normal exit, or 1 when command cannot be started.
    # stderr of the command is merged into its stdout and every output line
    # is stored as log info (priority 3). Nagios status is set when
    # applicable.
    sub run_probe {
        my ($self, $command, $rc, $signo) = @_;

        # Start command; the single command string (with the redirection) is
        # handed over as one argument, as in the two-argument form
        $pid = open(my $output, '-|', $command." 2>&1");
        if (!defined($pid)) {
            $pid = -1;      # nothing started: keep signal handlers consistent
            alarm(0);
            nagstat->set_status(2, "Failed to run $command");
            return 1;
        }
        while (my $line = <$output>) {
            logger->log_func(3, $line);
        }
        ($$rc, $$signo) = $self->wait_probe();
        {
            # The child has been waited for above; only release the handle
            # and keep close() from overwriting $? and $!
            local ($?, $!);
            close($output);
        }
        return 0;
    }
}
307
308 ########################################################################
309 # Running main probe package
310 ########################################################################
311 package main;
312 use strict;
313 use warnings;
314
315 use Env qw(@PATH GLEXEC_LOCATION GLITE_LOCATION X509_USER_PROXY GLEXEC_CLIENT_CERT);
316 use Getopt::Long qw(:config no_ignore_case bundling);
317
# Probe settings, filled in by getopts()
my $timeout;    # Total maximum runtime for probe
my $critical;   # Time after which to kill gLExec
my $warning;    # Time after which to warn about slow gLExec
my $payload;    # Payload plus arguments: relative uses $PATH to find
my $verbose;    # Verbosity level
323
# Prints the full help text, including the built-in defaults, on stdout and
# exits with status 0
sub usage() {
    my $name = $0;
    $name =~ s/.*\///;      # strip the directory part of the script name
    print <<EOHELP;
Usage: $name [options]

Options:
-t|--timeout <timeout> maximum runtime for probe, default: $deftimeout sec
-w|--warning <timeout> runtime after which to warn, default: $defwarning sec
-c|--critical <timeout> runtime after which to probe is to be killed,
default: $defcritical sec
-x|--x509-user-proxy <file> set X509_USER_PROXY to given file
-g|--glexec-client-cert <file> set GLEXEC_CLIENT_CERT to given file
default: value of variable X509_USER_PROXY
-e|--execute <cmd> command to execute by gLExec
default: \"$defpayload\"
-v|--verbose be more verbose, more -v means more verbosity
-V|--version print version
--help show this helptext
-h show short usage information
EOHELP
    exit 0;
}
347
# Prints the one-line usage summary on stdout.
# Returns the result of print, which callers use as a true value.
sub shortusage() {
    my $name = $0;
    $name =~ s/.*\///;      # strip the directory part of the script name
    return print "Usage: $name [options]\n";
}
355
# Prints the script name and the probe version on stdout.
# Returns the result of print, which callers use as a true value.
sub version() {
    my $name = $0;
    $name =~ s/.*\///;      # strip the directory part of the script name
    return print "$name version: $probeversion\n";
}
363
# Parses command line options and sets global variables
#
# Starts from the built-in defaults and lets the options override them.
# -x and -g write straight into the X509_USER_PROXY and GLEXEC_CLIENT_CERT
# variables; GLEXEC_CLIENT_CERT falls back to X509_USER_PROXY when unset.
# --help, -h and -V print their text and exit with status 0.
# Invalid options print the short usage and exit with status 3 (UNKNOWN):
# the probe's exit status is its nagios state, so a usage error must not
# end with 0 (OK).
sub getopts() {
    my $version;
    my $help;
    my $shorthelp;

    $timeout  = $deftimeout;
    $critical = $defcritical;
    $warning  = $defwarning;
    $payload  = $defpayload;

    my $parsed = GetOptions(
        "t|timeout=i"            => \$timeout,
        "c|critical=i"           => \$critical,
        "w|warning=i"            => \$warning,
        "x|x509-user-proxy=s"    => \$X509_USER_PROXY,
        "g|glexec-client-cert=s" => \$GLEXEC_CLIENT_CERT,
        "e|execute=s"            => \$payload,
        "v|verbose+"             => \$verbose,
        "help+"                  => \$help,
        "h+"                     => \$shorthelp,
        "V|version+"             => \$version,
        # Accepted but not used by this probe; they are plain flags here and
        # do not take a value
        "H|host",
        "p|port",
        "u|url"
    );
    if (!$parsed) {
        # usage() would exit with 0, hence the short form is printed here
        shortusage();
        exit(3);
    }

    if ($help) {
        usage();
        exit(0);
    }
    if ($shorthelp) {
        shortusage();
        exit(0);
    }
    if ($version) {
        version();
        exit(0);
    }
    if (!defined $GLEXEC_CLIENT_CERT) {
        $GLEXEC_CLIENT_CERT = $X509_USER_PROXY;
    }
}
399
# Exit function: prints the nagios status line, dumps the collected log and
# exits with the nagios state as exit status
sub nagios_exit() {
    my $state = nagstat->get_status();

    # Log trace follows the status line
    logger->get_log();

    exit($state);
}
409
# Finds gLExec in path and pre-specified directories
#
# Search order: $GLEXEC_LOCATION/sbin, the directories in PATH,
# $GLITE_LOCATION/sbin (GLITE_LOCATION defaults to /opt/glite) and a list of
# standard system directories.
# Returns the path of the first executable glexec found, or undef when there
# is none.
# Note that GLITE_LOCATION and an empty PATH are set through the Env
# variables, so the changes are also made to the environment.
sub find_glexec {
    my @DEFAULT_PATH = ("/usr/local/sbin", "/usr/sbin", "/sbin",
                        "/usr/local/bin", "/usr/bin");

    # Try GLEXEC_LOCATION
    if (defined $GLEXEC_LOCATION) {
        logger->log_func(3, "GLEXEC_LOCATION=".$GLEXEC_LOCATION."\n");
        my $glexloc = $GLEXEC_LOCATION."/sbin/glexec";
        if (-x $glexloc) {
            logger->log_func(2, "gLExec found at ".$glexloc."\n");
            return $glexloc;
        }
        logger->log_func(2, "gLExec NOT found at \$GLEXEC_LOCATION\n");
    }

    # Try GLITE_LOCATION
    $GLITE_LOCATION = "/opt/glite" if (!defined $GLITE_LOCATION);
    logger->log_func(3, "GLITE_LOCATION=".$GLITE_LOCATION."\n");

    # Fall back to the current directory only when PATH is really empty; a
    # PATH consisting of a single directory has to be left alone
    @PATH = (".") if (!defined $ENV{PATH} || $ENV{PATH} eq "");

    for my $dir (@PATH, $GLITE_LOCATION."/sbin", @DEFAULT_PATH) {
        logger->log_func(3, "Looking for glexec in ".$dir."\n");
        my $glexloc = $dir."/glexec";
        if (-x $glexloc) {
            logger->log_func(2, "gLExec found at ".$glexloc."\n");
            return $glexloc;
        }
    }
    return;
}
445
# Translates the result of the gLExec run into a nagios status:
# glexec_to_nagios(exitcode, signal number, runtime)
#
# - terminated by a signal:                     CRITICAL
# - exit code 0:                                OK, or WARNING when the
#                                               runtime reached $warning
# - exit code 126 (payload can't be executed):  WARNING
# - exit codes 201-204 (gLExec errors):         CRITICAL
# - any other non-zero exit code:               CRITICAL
# Performance data is set whenever the exit code is 0.
# NOTE(review): returns 0 only for the slow-success case and 1 in every
# other case, including plain success; the value is not used within this
# file.
sub glexec_to_nagios($$$) {
    my ($rc, $signo, $dt) = @_;

    if ($rc == 0) {
        nagstat->set_perfdata("${dt}s;$warning;$critical;0");
        if ($signo != 0) {
            # The exit code part of the wait status is 0 for a process that
            # was terminated by a signal: this is not a success
            nagstat->set_status(2, "exit due to signal $signo ($rc)");
            return 1;
        }
        if ($dt >= $warning) {
            nagstat->set_status(1, "gLExec took long time to succeed");
            return 0;
        }
        nagstat->set_status(0, "Success");
        return 1;
    }

    if ($rc == 126) {
        nagstat->set_status(1, "executable $payload can't be executed ($rc)");
    } elsif ($rc == 201) {
        nagstat->set_status(2, "client error ($rc)");
    } elsif ($rc == 202) {
        nagstat->set_status(2, "system error ($rc)");
    } elsif ($rc == 203) {
        nagstat->set_status(2, "authorization error ($rc)");
    } elsif ($rc == 204) {
        nagstat->set_status(2, "exit code overlap error ($rc)");
    } elsif ($signo != 0) {
        nagstat->set_status(2, "exit due to signal $signo ($rc)");
    } else {
        nagstat->set_status(2,
            "executable $payload failed with non-zero exit code ($rc)");
    }
    return 1;
}
478
# Find gLExec command, payload command (when relative), runs it and returns
# status
#
# Sets the nagios status and returns 1 when gLExec cannot be found, when the
# proxy files are missing or empty, or when the command cannot be started.
# Otherwise returns the result of glexec_to_nagios() for the exit code,
# signal number and runtime (in whole seconds) of the run.
sub run_glexec() {
    my $glexec;
    my $exitcode;
    my $signo;

    # Make sure to have starttime
    my $t1 = time();

    # Set alarm before looking for gLExec to prevent NFS timeouts
    alarm($critical);

    # Find glexec command
    if (!defined($glexec = find_glexec())) {
        nagstat->set_status(2, "glexec command not found");
        return 1;
    }

    # Check proxies
    if (!defined $X509_USER_PROXY) {
        nagstat->set_status(3, "\$X509_USER_PROXY is unset.");
        return 1;
    }
    if (! -e $X509_USER_PROXY || ! -s $X509_USER_PROXY) {
        nagstat->set_status(3,
            "\$X509_USER_PROXY does not point to a nonempty file.");
        return 1;
    }
    if (! -e $GLEXEC_CLIENT_CERT || ! -s $GLEXEC_CLIENT_CERT) {
        nagstat->set_status(3,
            "\$GLEXEC_CLIENT_CERT does not point to a nonempty file.");
        return 1;
    }

    # Find full path for payload if it's relative
    if ($payload !~ /^\/.*/) {
        # Command name is everything up to the first space
        (my $name = $payload) =~ s/ .*//;
        for my $dir (@PATH) {
            logger->log_func(3, "Looking for ".$name." in ".$dir."\n");
            my $fullname = $dir."/".$name;
            if (-x $fullname) {
                # The name is matched literally: it may contain characters
                # that are special in a regular expression
                $payload =~ s/^\Q$name\E/$fullname/;
                logger->log_func(2, "Payload set to ".$payload."\n");
                last;
            }
        }
    }

    # Run actual probe in child process
    if (probeipc->run_probe("$glexec $payload", \$exitcode, \$signo) != 0) {
        return 1;
    }

    # Probe exited: find exit status
    my $t2 = time();
    return glexec_to_nagios($exitcode, $signo, $t2 - $t1);
}
540
# Main program: every step below works on the package-wide state of the
# logger, nagstat and probeipc packages.

# Command line options first, they provide the values used below
getopts();

# Logger gets the verbosity level as loglevel
logger->new($verbose);

# Nagios status starts as unknown
nagstat->new();

# Signal handling: leave via nagios_exit, SIGKILL after $timeout and
# SIGTERM after $critical
probeipc->new(\&nagios_exit, $timeout, $critical);

# The actual gLExec probe
run_glexec();

# Print nagios status, dump the log and exit
nagios_exit();
558

Properties

Name Value
svn:executable *

grid.support@nikhef.nl
ViewVC Help
Powered by ViewVC 1.1.28