1 |
#!/usr/bin/perl |
2 |
# |
3 |
# Copyright (C) Nikhef 2011 |
4 |
# |
5 |
# Licensed under the Apache License, Version 2.0 (the "License"); |
6 |
# you may not use this file except in compliance with the License. |
7 |
# You may obtain a copy of the License at |
8 |
# |
9 |
# http://www.apache.org/licenses/LICENSE-2.0 |
10 |
# |
11 |
# Unless required by applicable law or agreed to in writing, software |
12 |
# distributed under the License is distributed on an "AS IS" BASIS, |
13 |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14 |
# See the License for the specific language governing permissions and |
15 |
# limitations under the License. |
16 |
# |
17 |
# Author: |
18 |
# Mischa Sall\'e <msalle@nikhef.nl> |
19 |
# NIKHEF Amsterdam, the Netherlands |
20 |
# |
21 |
######################################################################## |
22 |
# |
23 |
# Nagios probe to test functioning of gLExec |
24 |
# |
25 |
# Nagios state can be one of the following: |
26 |
# - Missing glexec command: CRITICAL |
27 |
# - input proxies empty: UNKNOWN |
28 |
# - short timeout exceeded: WARNING |
29 |
# - timeout exceeded: CRITICAL |
30 |
# - gLExec exit codes: |
31 |
# 0 glexec succeeded: OK |
32 |
# 201 Client error: CRITICAL |
33 |
# 202 Internal error: CRITICAL |
34 |
# 203 Auth error: CRITICAL |
35 |
# 204 Overlap: CRITICAL |
36 |
# 126 execve failed: WARNING |
37 |
# 128+n signal: WARNING |
38 |
# !=0 rc of payload: WARNING |
39 |
# |
40 |
######################################################################## |
41 |
|
42 |
# Built-in defaults
my $probeversion = 0.2;         # version number printed by version()

# The values below are only starting points: getopts() copies them into the
# working variables and command line options may then override them.
my ($deftimeout, $defcritical, $defwarning) = (
    10,                         # overall timeout for the probe
    8,                          # when to send SIGTERM
    5,                          # when to warn about slow running
);
my $defpayload = "id -a";       # payload command to run
50 |
|
51 |
######################################################################## |
52 |
# Logging package |
53 |
# keeps internal log trace which can be dumped with get_log
54 |
######################################################################## |
55 |
package logger;
use strict;
use warnings;
{
    # The log state lives in lexicals private to this block and is shared by
    # every caller: the rest of the file invokes the methods on the class
    # name (logger->log_func(...)), not on the object returned by new().
    my $loglevel = 0;   # messages with a priority above this are dropped
    my @logstring;      # accepted messages, in the order they were logged

    # Constructor: new([level])
    # Sets the log level to the given level, or to 0 when none is given.
    # Previously collected messages are kept. Returns a blessed object.
    sub new {
        my ($class, $level) = @_;
        my $self = bless {}, (ref($class) || $class);
        $self->set_loglevel(defined $level ? $level : 0);
        return $self;
    }

    # Sets loglevel: set_loglevel(level)
    # An undefined level is stored as 0 so that log_func never has to
    # compare against undef.
    sub set_loglevel {
        my ($self, $level) = @_;
        $loglevel = defined $level ? $level : 0;
        return $loglevel;
    }

    # Logging function: log_func(priority, "logstring\n", ...)
    # Stores all given strings when priority does not exceed the loglevel.
    sub log_func {
        my ($self, $prio, @lines) = @_;
        return if ($prio > $loglevel);
        push @logstring, @lines;
        return;
    }

    # Dumps log: prints every stored string, in order, to the currently
    # selected output handle. The stored strings are not cleared.
    sub get_log {
        my $self = shift;
        foreach my $entry (@logstring) {
            print $entry;
        }
        return;
    }
}
100 |
|
101 |
######################################################################## |
102 |
# Nagios status printing package |
103 |
# Can set and dump nagios status output |
104 |
######################################################################## |
105 |
package nagstat;
{
    # The status lives in lexicals private to this block and is shared by
    # every caller: the rest of the file invokes the methods on the class
    # name (nagstat->set_status(...)), not on the object returned by new().
    # Everything is initialised here as well as in new(), so that
    # get_status() is well-defined even when called before new().
    my $code = 3;       # Nagios code, default status unknown
    my $summary;        # summary text, undef until set_status() stores one
    my $perfdata;       # optional performance data string
    my @stat = ("OK","WARNING","CRITICAL","UNKNOWN");   # label per code

    # Constructor: resets code, summary and performance data.
    sub new {
        my $class = shift;
        my $self = bless {}, (ref($class) || $class);
        $code = 3;
        $summary = undef;
        $perfdata = undef;
        return $self;
    }

    # Set nagios code (0-3) plus summary: set_status(code, summary)
    # Only the first status is stored: once a summary is set, later calls
    # are ignored.
    sub set_status {
        my ($self, $newcode, $newsummary) = @_;
        if (!defined $summary) {
            $code = $newcode;
            $summary = $newsummary;
        }
        return;
    }

    # Set internal performance data: set_perfdata(string)
    sub set_perfdata {
        my ($self, $data) = @_;
        $perfdata = $data;
        return;
    }

    # Prints "<LABEL>: <summary>[|<perfdata>]" followed by a newline.
    # The summary defaults to "unknown status" when none was set.
    # Return value is the code (0-3); a code outside that range is reported
    # and returned as 3 (UNKNOWN) instead of printing an undefined label.
    sub get_status {
        if (!defined $summary) {
            $summary = "unknown status";
        }
        my $rc = (defined $code && $code =~ /\A[0-3]\z/) ? $code : 3;
        my $line = $stat[$rc].": ".$summary;
        $line .= "|".$perfdata if (defined $perfdata);
        print $line."\n";
        return $rc;
    }
}
152 |
|
153 |
######################################################################## |
154 |
# Inter process communication package for nagios probes |
155 |
# Starts alarm handler when receiving alarm which checks status of |
156 |
# probe, and terminates or kills it. |
157 |
######################################################################## |
158 |
package probeipc;
use POSIX ":sys_wait_h";
use Time::HiRes qw(alarm);
{
    # The state lives in lexicals private to this block and is shared by the
    # signal handlers and all callers: the rest of the file invokes the
    # methods on the class name (probeipc->run_probe(...)).
    my $pid      = -1;  # pid of the started command, -1 while there is none
    my $wpid     = 0;   # last waitpid() result: 0 means not reaped yet
    my $status   = 0;   # raw wait status ($?) stored by the last waitpid()
    my $numsent  = 0;   # becomes 1 once SIGTERM has been sent
    my $killtime = 10;  # time after which to send SIGKILL
    my $termtime = 10;  # time after which to send SIGTERM
    my $exitfunc;       # code reference called when the probe is aborted

    # Constructor: new(exitfunc,[kill time], [term time])
    # Dies when exitfunc is missing. A missing or zero kill time becomes 10,
    # a missing or zero term time becomes the kill time. Resets the process
    # bookkeeping and installs the handlers for SIGALRM, SIGINT and SIGTERM.
    sub new {
        my $class = shift;
        my $self = bless {}, (ref($class) || $class);
        my $func = shift or die ($class."::new() needs exitfunc arg\n");
        my $kill = (shift or 10);   # probe default timeout is 10
        my $term = (shift or $kill);
        $self->set_exitfunc($func);
        $self->set_killtime($kill);
        $self->set_termtime($term);
        $pid = -1;
        $wpid = 0;
        $status = 0;
        $numsent = 0;
        $SIG{'ALRM'} = \&alarm_handler;
        $SIG{'INT'}  = \&int_handler;
        $SIG{'TERM'} = \&int_handler;
        return $self;
    }

    # Sets time after which to send SIGKILL
    sub set_killtime {
        my ($self, $value) = @_;
        $killtime = $value;
        return;
    }

    # Sets time after which to send SIGTERM
    sub set_termtime {
        my ($self, $value) = @_;
        $termtime = $value;
        return;
    }

    # Sets function to call when the probe is aborted from a signal handler
    sub set_exitfunc {
        my ($self, $func) = @_;
        $exitfunc = $func;
        return;
    }

    # Private helper for the signal handlers: non-blocking waitpid() on the
    # command. Stores the result in $wpid and the wait status in $status and
    # returns $wpid. $? is put back afterwards, because a handler can run
    # between the blocking waitpid() in wait_probe() and the statement that
    # reads its $?.
    sub _poll_child {
        my $saved = $?;
        $wpid = waitpid($pid, WNOHANG);
        $status = $?;
        $? = $saved;
        return $wpid;
    }

    # Signal handler for SIGALRM
    # - command not started yet: set CRITICAL status and call exitfunc
    # - command already reaped (or waitpid failed): nothing to do
    # - command still running: send SIGTERM and re-arm the alarm for the
    #   remaining kill time; when SIGTERM was already sent, or the kill time
    #   does not exceed the term time, send SIGKILL and call exitfunc
    sub alarm_handler {
        if ($pid < 0) {     # No pid, nothing to do
            logger->log_func(2,"Payload hasn't started yet\n");
            nagstat->set_status(2,"probe killtime exceeded");
            $exitfunc->();
            return;         # only reached when exitfunc does not exit
        }
        # Either is or was a process: test status
        logger->log_func(2,"subprocess is/was running with pid ".$pid."\n");
        return if ($wpid != 0);         # finished (>0) or waitpid failed (-1)
        return if (_poll_child() != 0); # finished just now, or waitpid failed

        # Still running
        if ($killtime <= $termtime || $numsent == 1) {
            logger->log_func(2,"Sending SIGKILL to ".$pid."\n");
            kill('KILL', $pid);
            nagstat->set_status(2,"probe timeout exceeded");
            $exitfunc->();
            return;         # only reached when exitfunc does not exit
        }
        logger->log_func(2,"Sending SIGTERM to ".$pid."\n");
        kill('TERM', $pid);
        $numsent = 1;
        alarm($killtime - $termtime);
        nagstat->set_status(2,"probe critical time exceeded");
        return;
    }

    # Signal handler for SIGINT and SIGTERM
    # Sets CRITICAL status, sends SIGTERM to the command when it is still
    # running and calls exitfunc.
    sub int_handler {
        my ($sig) = @_;

        logger->log_func(2,"Caught SIG$sig\n");
        nagstat->set_status(2,"probe interrupted with SIG$sig");
        if ($pid < 0) {     # No pid, nothing to do
            logger->log_func(2,"Payload hasn't started yet\n");
        } elsif ($wpid != 0) {  # finished (>0) or waitpid failed (-1)
            logger->log_func(2,"Subprocess with pid ".$pid." already finished\n");
        } elsif (_poll_child() == 0) {  # Still running: send SIGTERM
            logger->log_func(2,"Sending SIGTERM to ".$pid."\n");
            kill('TERM', $pid);
        }
        $exitfunc->();
        return;             # only reached when exitfunc does not exit
    }

    # Waits for the started command and returns (exitcode, signal number).
    # When a signal handler already reaped the command, the wait status
    # stored by that handler is used. Returns (-1, 0) when the wait status
    # could not be collected. The pending alarm is cancelled once the
    # command has been reaped.
    sub wait_probe {
        my $self = shift;

        if ($wpid <= 0) {
            my $reaped = waitpid($pid, 0);
            $status = $?;
            $wpid = $reaped;
        }
        return (-1, 0) if ($wpid != $pid);  # wait status is lost
        alarm(0);
        return ($status >> 8, $status & 127);
    }

    # Starts specified command: run_probe("command",\$rc,\$signo) and returns 0
    # on normal exit, or 1 when command cannot be started.
    # The command string is run with its stderr redirected to stdout; every
    # output line is stored as log info with priority 3. Nagios status is set
    # when the command cannot be started.
    sub run_probe {
        my ($self, $command, $rc, $signo) = @_;

        # A single command string is used on purpose: the redirection is
        # part of the string, as it was in the original pipe open.
        my $child = open(my $fh, '-|', $command." 2>&1");
        if (!defined $child) {
            alarm(0);
            nagstat->set_status(2,"Failed to run $command");
            return 1;
        }
        $pid = $child;  # $pid keeps its -1 marker when open fails
        while (my $line = <$fh>) {
            logger->log_func(3,$line);
        }
        ($$rc,$$signo) = $self->wait_probe();
        # The command is already reaped by wait_probe(), so the result of
        # closing the pipe carries no information and is ignored.
        close($fh);
        return 0;
    }
}
308 |
|
309 |
######################################################################## |
310 |
# Running main probe package |
311 |
######################################################################## |
312 |
package main; |
313 |
use strict; |
314 |
use warnings; |
315 |
|
316 |
use Env qw(@PATH GLEXEC_LOCATION GLITE_LOCATION X509_USER_PROXY GLEXEC_CLIENT_CERT); |
317 |
use Getopt::Long qw(:config no_ignore_case bundling); |
318 |
use Time::HiRes qw(time alarm); |
319 |
|
320 |
# Working settings of the probe; getopts() fills them in.
my (
    $timeout,   # Total maximum runtime for probe
    $critical,  # Time after which to kill gLExec
    $warning,   # Time after which to warn about slow gLExec
    $payload,   # Payload plus arguments: relative uses $PATH to find
    $verbose,   # Verbosity level
);
325 |
|
326 |
# Prints usage output |
327 |
# Prints the full usage text, naming the program after the last path
# component of $0 and showing the built-in defaults.
# Returns 1 and does NOT exit: the caller chooses the exit code, so that an
# invalid command line is not reported with exit code 0, which Nagios
# would read as OK.
sub usage() {
    (my $name = $0) =~ s{.*/}{};
    print <<EOHELP;
Usage: $name [options]

Options:
 -t|--timeout <timeout>          maximum runtime for probe, default: $deftimeout sec
 -w|--warning <timeout>          runtime after which to warn, default: $defwarning sec
 -c|--critical <timeout>         runtime after which to probe is to be killed,
                                 default: $defcritical sec
 -x|--x509-user-proxy <file>     set X509_USER_PROXY to given file
 -g|--glexec-client-cert <file>  set GLEXEC_CLIENT_CERT to given file
                                 default: value of variable X509_USER_PROXY
 -e|--execute <cmd>              command to execute by gLExec
                                 default: "$defpayload"
 -v|--verbose                    be more verbose, more -v means more verbosity
 -V|--version                    print version
 --help                          show this helptext
 -h                              show short usage information
EOHELP
    return 1;
}
349 |
|
350 |
# Prints short usage output (oneline) |
351 |
# Prints the one-line usage summary; the program name is $0 with everything
# up to the last slash removed. Returns the result of print.
sub shortusage() {
    my $name = $0;
    $name =~ s{.*/}{};
    return print "Usage: ${name} [options]\n";
}
357 |
|
358 |
# Prints probe version |
359 |
# Prints "<program name> version: <probe version>"; the program name is $0
# with everything up to the last slash removed. Returns the result of print.
sub version() {
    my $name = $0;
    $name =~ s{.*/}{};
    return print "${name} version: ${probeversion}\n";
}
365 |
|
366 |
# Parses command line options and sets global variables |
367 |
# Parses command line options and sets global variables.
# Starts from the built-in defaults, prints usage/version information when
# asked for (and exits), defaults GLEXEC_CLIENT_CERT to X509_USER_PROXY and
# finally clamps the time limits.
sub getopts() {
    my ($want_help, $want_shorthelp, $want_version);

    ($timeout, $critical, $warning, $payload) =
        ($deftimeout, $defcritical, $defwarning, $defpayload);

    # -H, -p and -u are accepted as plain flags without a destination
    my $parsed_ok = GetOptions(
        "t|timeout=f"            => \$timeout,
        "c|critical=f"           => \$critical,
        "w|warning=f"            => \$warning,
        "x|x509-user-proxy=s"    => \$X509_USER_PROXY,
        "g|glexec-client-cert=s" => \$GLEXEC_CLIENT_CERT,
        "e|execute=s"            => \$payload,
        "v|verbose+"             => \$verbose,
        "help+"                  => \$want_help,
        "h+"                     => \$want_shorthelp,
        "V|version+"             => \$want_version,
        "H|host",
        "p|port",
        "u|url"
    );
    if (!$parsed_ok) {
        usage() and exit(1);
    }

    # Informational options: print and leave
    if ($want_help) {
        usage() and exit(0);
    }
    if ($want_shorthelp) {
        shortusage() and exit(0);
    }
    if ($want_version) {
        version() and exit(0);
    }

    # Without an explicit client certificate the user proxy is used
    $GLEXEC_CLIENT_CERT = $X509_USER_PROXY unless (defined $GLEXEC_CLIENT_CERT);

    # Negative limits become 0; critical never exceeds the overall timeout
    for my $limit (\$timeout, \$critical, \$warning) {
        $$limit = 0 if ($$limit < 0);
    }
    $critical = $timeout if ($critical > $timeout);
}
405 |
|
406 |
# Exit function: prints nagios status and dumps log |
407 |
# Exit function: prints the nagios status line first, then dumps the
# collected log, and exits with the nagios status code.
sub nagios_exit() {
    my $exitcode = nagstat->get_status();
    logger->get_log();
    exit($exitcode);
}
415 |
|
416 |
# Finds gLExec in path and pre-specified directories |
417 |
# Finds the gLExec executable and returns its path, or undef (empty list in
# list context) when it is not found. Takes no arguments.
# Search order:
#   1. $GLEXEC_LOCATION/sbin/glexec, when GLEXEC_LOCATION is set
#   2. glexec in every directory of PATH ("." when PATH is unset or empty)
#   3. glexec in $GLITE_LOCATION/sbin (GLITE_LOCATION is set to /opt/glite
#      when it is unset)
#   4. glexec in a fixed list of system directories
# Only executable regular files are accepted.
sub find_glexec {
    my @DEFAULT_PATH = ("/usr/local/sbin","/usr/sbin","/sbin",
                        "/usr/local/bin","/usr/bin");

    # Try GLEXEC_LOCATION
    if (defined $GLEXEC_LOCATION) {
        logger->log_func(3,"GLEXEC_LOCATION=".$GLEXEC_LOCATION."\n");
        my $glexloc = $GLEXEC_LOCATION."/sbin/glexec";
        if (-f $glexloc && -x _) {
            logger->log_func(2,"gLExec found at ".$glexloc."\n");
            return $glexloc;
        }
        logger->log_func(2,"gLExec NOT found at \$GLEXEC_LOCATION\n");
    }

    # Try GLITE_LOCATION
    $GLITE_LOCATION = "/opt/glite" if (!defined $GLITE_LOCATION);
    logger->log_func(3,"GLITE_LOCATION=".$GLITE_LOCATION."\n");

    # Fall back to "." only when there is no PATH at all; a PATH with a
    # single directory must be left alone.
    @PATH = (".") if (!defined $ENV{PATH} || $ENV{PATH} eq '');

    for my $dir (@PATH, $GLITE_LOCATION."/sbin", @DEFAULT_PATH) {
        logger->log_func(3,"Looking for glexec in ".$dir."\n");
        my $glexloc = $dir."/glexec";
        if (-f $glexloc && -x _) {
            logger->log_func(2,"gLExec found at ".$glexloc."\n");
            return $glexloc;
        }
    }
    return;
}
451 |
|
452 |
# Translates the result of the gLExec run into a nagios status:
# glexec_to_nagios(exitcode, signal number, runtime in seconds)
#   - undefined exit code:            UNKNOWN
#   - terminated by a signal:         CRITICAL
#   - exit code 0:                    OK, or WARNING when the runtime is at
#                                     least the warning time; performance
#                                     data is set in both cases
#   - exit code 126:                  WARNING
#   - exit codes 201, 202, 203, 204:  CRITICAL
#   - any other exit code:            CRITICAL
# The signal number is tested before the exit code, because a process that
# was terminated by a signal has exit code 0 and must not count as success.
# Returns 0 when gLExec exited with code 0, 1 otherwise.
# NOTE(review): the table in the file header lists signals and non-zero
# payload exit codes as WARNING, whereas this code reports CRITICAL for
# them. The code's severities are left unchanged -- confirm which is meant.
sub glexec_to_nagios($$$) {
    my ($rc, $signo, $dt) = @_;

    if (!defined $rc) {
        nagstat->set_status(3,"could not determine gLExec exit status");
        return 1;
    }
    if (defined $signo && $signo != 0) {
        nagstat->set_status(2,"exit due to signal $signo ($rc)");
        return 1;
    }
    if ($rc == 0) {
        nagstat->set_perfdata("${dt}s;$warning;$critical;0");
        if ($dt >= $warning) {
            nagstat->set_status(1,"gLExec took long time to succeed");
        } else {
            nagstat->set_status(0,"Success");
        }
        return 0;
    }

    # Exit codes with a dedicated meaning: [nagios code, summary]
    my %known_error = (
        126 => [1,"executable $payload can't be executed ($rc)"],
        201 => [2,"client error ($rc)"],
        202 => [2,"system error ($rc)"],
        203 => [2,"authorization error ($rc)"],
        204 => [2,"exit code overlap error ($rc)"],
    );
    my $entry = $known_error{$rc};
    if (defined $entry) {
        nagstat->set_status($entry->[0], $entry->[1]);
    } else {
        nagstat->set_status(2,
            "executable $payload failed with non-zero exit code ($rc)");
    }
    return 1;
}
484 |
|
485 |
# Find gLExec command, payload command (when relative), runs it and returns |
486 |
# status |
487 |
# Finds the gLExec command, checks the proxy files, resolves a relative
# payload command via PATH, runs "<glexec> <payload>" and translates the
# outcome into a nagios status.
# Returns 1 when gLExec cannot be found, a proxy check fails or the command
# cannot be started; otherwise returns the result of glexec_to_nagios().
sub run_glexec() {
    # Make sure to have starttime
    my $t1 = time();

    # Set alarm before looking for gLExec to prevent NFS timeouts
    alarm($critical);

    # Find glexec command
    my $glexec = find_glexec();
    if (!defined $glexec) {
        nagstat->set_status(2,"glexec command not found");
        return 1;
    }

    # Check proxies
    if (!defined $X509_USER_PROXY) {
        nagstat->set_status(3,"\$X509_USER_PROXY is unset.");
        return 1;
    }
    if (! -e $X509_USER_PROXY || ! -s $X509_USER_PROXY) {
        nagstat->set_status(3,
            "\$X509_USER_PROXY does not point to a nonempty file.");
        return 1;
    }
    if (! -e $GLEXEC_CLIENT_CERT || ! -s $GLEXEC_CLIENT_CERT) {
        nagstat->set_status(3,
            "\$GLEXEC_CLIENT_CERT does not point to a nonempty file.");
        return 1;
    }

    # Find full path for payload if it's relative
    if ($payload !~ /^\/.*/) {
        # The command name is everything before the first space
        (my $name = $payload) =~ s/ .*//;
        for my $dir (@PATH) {
            logger->log_func(3,"Looking for ".$name." in ".$dir."\n");
            my $fullname = $dir."/".$name;
            if (-x $fullname) {
                # $name is the literal start of $payload: replace it by
                # position, so that characters that are special in a regular
                # expression (as in "c++") cannot break the replacement.
                substr($payload, 0, length($name), $fullname);
                logger->log_func(2,"Payload set to ".$payload."\n");
                last;
            }
        }
    }

    # Run actual probe in child process
    my ($exitcode, $signo);
    if (probeipc->run_probe("$glexec $payload",\$exitcode,\$signo) != 0) {
        return 1;
    }

    # Probe exited: find exit status; runtime is rounded to milliseconds
    my $t2 = time();
    my $dt = int(($t2-$t1)*1000+0.5)/1000;
    return glexec_to_nagios($exitcode,$signo,$dt);
}
547 |
|
548 |
# Read the command line: fills in time limits, payload and verbosity
getopts();

# Set the log level from the requested verbosity
logger->new($verbose);

# Reset the nagios status (code, summary, performance data)
nagstat->new();

# Install the signal handlers; nagios_exit is called when the probe is
# aborted, $timeout is the kill time and $critical the term time
probeipc->new(\&nagios_exit, $timeout, $critical);

# Locate gLExec and run it with the payload
run_glexec();

# Print nagios status, dump the log and exit with the status code
nagios_exit();
565 |
|