#!/usr/bin/perl
#
# Copyright (C) Nikhef 2011
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Author:
# Mischa Sall\'e <msalle@nikhef.nl>
# NIKHEF Amsterdam, the Netherlands
#
########################################################################
#
# Nagios probe to test functioning of gLExec
#
# Nagios state can be one of the following:
# - Missing glexec command: CRITICAL
# - input proxies empty: UNKNOWN
# - short timeout exceeded: WARNING
# - timeout exceeded: CRITICAL
# - gLExec exit codes:
#     0      glexec succeeded: OK
#     201    Client error: CRITICAL
#     202    Internal error: CRITICAL
#     203    Auth error: CRITICAL
#     204    Overlap: CRITICAL
#     126    execve failed: WARNING
#     128+n  signal: WARNING
#     !=0    rc of payload: WARNING
#
########################################################################

# DEFAULTS
my $probeversion = 0.2;

# Fallback values; each of them can be replaced with a command line option.
my ($deftimeout,     # overall timeout for the probe
    $defcritical,    # when to send SIGTERM
    $defwarning)     # when to warn about slow running
    = (10, 8, 5);
my $defpayload = 'id -a';    # which payload to run

########################################################################
# Logging package
# keeps internal log trace which can be dumped with get_log
########################################################################
package logger;
use strict;
use warnings;
{
    # The rest of the file calls these subs as class methods
    # (logger->log_func(...)), so the state is kept in the two lexicals
    # below, which are shared by every caller, and not in the object.
    my $loglevel = 0;    # 0 from the start: log_func() must not compare
                         # against undef when it runs before new()
    my @logstring;       # collected log lines, in order of arrival

    # Constructor: new([level])
    # Sets the shared log level to level, or to 0 when level is undefined,
    # and returns the new object.
    sub new {
        my ($classname, $level) = @_;
        my $self = bless {}, (ref($classname) || $classname || __PACKAGE__);
        $self->set_loglevel($level);
        return $self;
    }

    # Sets loglevel; an undefined level counts as 0
    sub set_loglevel {
        my ($self, $level) = @_;
        $loglevel = defined $level ? $level : 0;
        return;
    }

    # Logging function: log_func(priority, "logstring\n", ...);
    # The strings are stored only when priority does not exceed the
    # current log level; nothing is printed here.
    sub log_func {
        my ($self, $prio, @lines) = @_;
        return if ($prio > $loglevel);
        push @logstring, @lines;
        return;
    }

    # Dumps log: prints every stored string to the currently selected
    # output handle. The stored strings are kept.
    sub get_log {
        my $self = shift;
        print for @logstring;
        return;
    }
}

########################################################################
# Nagios status printing package
# Can set and dump nagios status output
########################################################################
package nagstat;
{
    # Shared by all callers; the subs are invoked as class methods.
    my ($code, $summary, $perfdata, @stat);

    # Constructor: resets the state to "unknown" without summary or
    # performance data and returns the new object.
    sub new() {
        my $classname = shift;
        ($code, $summary, $perfdata) = (3, undef, undef);
        @stat = ("OK", "WARNING", "CRITICAL", "UNKNOWN");
        my $self = {};
        bless $self;
        return $self;
    }

    # Set nagios code (0-3) plus summary.
    # Only the first call has an effect: once a summary is present, later
    # calls leave code and summary alone.
    sub set_status($$) {
        my ($self, $newcode, $newsummary) = @_;
        return if (defined $summary);
        ($code, $summary) = ($newcode, $newsummary);
    }

    # Set internal performance data
    sub set_perfdata($) {
        my ($self, $data) = @_;
        $perfdata = $data;
    }

    # Printout nagios status, summary and optionally performance data
    # return value is code (0-3)
    sub get_status {
        $summary = "unknown status" if (!defined $summary);
        my $line = $stat[$code].": ".$summary;
        $line .= "|".$perfdata if (defined $perfdata);
        print $line."\n";
        return $code;
    }
}

########################################################################
# Inter process communication package for nagios probes
# Installs handlers for SIGALRM, SIGINT and SIGTERM. The alarm handler
# checks the status of the probe and terminates or kills it.
########################################################################
package probeipc;
use strict;
use warnings;
use POSIX ":sys_wait_h";
{
    # The subs are used as class methods and as signal handlers, so the
    # state is kept in lexicals shared by all of them.
    my $pid;         # pid returned by the piped open, -1 while there is none
    my $wpid;        # last waitpid() result, 0 while the child is not reaped
    my $status;      # raw wait status ($?) saved right after each waitpid()
    my $numsent;     # 1 once SIGTERM has been sent by the alarm handler
    my $killtime;    # see set_killtime
    my $termtime;    # see set_termtime
    my $exitfunc;    # see set_exitfunc

    # Constructor: new(exitfunc,[kill time], [term time])
    # Dies when exitfunc is missing. kill time defaults to 10, term time
    # to kill time. Installs the signal handlers and returns the object.
    sub new {
        my $classname = shift;
        my $self = bless {}, (ref($classname) || $classname || __PACKAGE__);
        my $func = shift
            or die ($classname."::new() needs exitfunc arg\n");
        my $kill = (shift || 10);    # probe default timeout is 10
        my $term = (shift || $kill);
        $self->set_exitfunc($func);
        $self->set_killtime($kill);
        $self->set_termtime($term);
        $pid     = -1;
        $wpid    = 0;
        $status  = 0;
        $numsent = 0;
        $SIG{'ALRM'} = \&alarm_handler;
        $SIG{'INT'}  = \&int_handler;
        $SIG{'TERM'} = \&int_handler;
        return $self;
    }

    # Sets time after which to send SIGKILL
    sub set_killtime {
        my ($self, $value) = @_;
        $killtime = $value;
        return;
    }

    # Sets time after which to send SIGTERM
    sub set_termtime {
        my ($self, $value) = @_;
        $termtime = $value;
        return;
    }

    # Sets function to call when exiting after sending a SIGKILL
    sub set_exitfunc {
        my ($self, $func) = @_;
        $exitfunc = $func;
        return;
    }

    # Signal handler for SIGALRM
    # First alarm with a running child: send SIGTERM and re-arm the alarm
    # for killtime-termtime. Second alarm, or killtime<=termtime: send
    # SIGKILL and call the exit function.
    sub alarm_handler {
        if ($pid < 0) {    # No pid, nothing to do
            logger->log_func(2, "Payload hasn't started yet\n");
            nagstat->set_status(2, "probe killtime exceeded");
            $exitfunc->();
        }
        # Either is or was a process: test status
        logger->log_func(2, "subprocess is/was running with pid ".$pid."\n");
        # process has finished (>0) or waitpid failed (-1)
        return if ($wpid != 0);

        # Get status; keep it for wait_probe()
        $wpid   = waitpid($pid, WNOHANG);
        $status = $?;
        return if ($wpid != 0);

        # Still running
        if ($killtime <= $termtime || $numsent == 1) {
            logger->log_func(2, "Sending SIGKILL to ".$pid."\n");
            kill('KILL', $pid);
            nagstat->set_status(2, "probe timeout exceeded");
            $exitfunc->();
        }
        logger->log_func(2, "Sending SIGTERM to ".$pid."\n");
        kill('TERM', $pid);
        $numsent = 1;
        alarm($killtime - $termtime);
        nagstat->set_status(2, "probe critical time exceeded");
        return;
    }

    # Signal handler for SIGINT and SIGTERM
    # Records the interruption, sends SIGTERM to a still running child and
    # calls the exit function in every case.
    sub int_handler {
        my ($sig) = @_;

        logger->log_func(2, "Caught SIG$sig\n");
        nagstat->set_status(2, "probe interrupted with SIG$sig");
        if ($pid < 0) {    # No pid, nothing to do
            logger->log_func(2, "Payload hasn't started yet\n");
            $exitfunc->();
        }
        # Either is or was a process: test status
        if ($wpid != 0) {  # process has finished (>0) or waitpid failed (-1)
            logger->log_func(2,
                "Subprocess with pid ".$pid." already finished\n");
            $exitfunc->();
        }
        # Get status
        $wpid   = waitpid($pid, WNOHANG);
        $status = $?;
        if ($wpid == 0) {  # Still running: send SIGTERM
            logger->log_func(2, "Sending SIGTERM to ".$pid."\n");
            kill('TERM', $pid);
        }
        $exitfunc->();
    }

    # Wait for the subprocess and return (exitcode, signal number).
    # The values are always taken from the wait status saved next to the
    # waitpid() call that reaped the child, also when that call was made
    # by a signal handler. When the status cannot be obtained, nagios
    # status UNKNOWN is set and (-1, 0) is returned, so the result is
    # never mistaken for a successful run.
    sub wait_probe {
        my $self = shift;

        if ($wpid <= 0) {
            # Not reaped by a signal handler: block until the child exits
            $wpid   = waitpid($pid, 0);
            $status = $?;
        }
        alarm(0);
        if ($wpid != $pid) {
            logger->log_func(2, "waitpid for pid ".$pid." failed\n");
            nagstat->set_status(3, "cannot determine exit status of probe");
            return (-1, 0);
        }
        return ($status >> 8, $status & 127);
    }

    # Starts specified command: run_probe("command",\$rc,\$signo) and
    # returns 0 on normal exit, or 1 when command cannot be started.
    # Output of command (stdout and stderr) is stored as log info with
    # priority 3. Nagios status is set when applicable.
    # The command string is handed to the shell because of the appended
    # redirection.
    sub run_probe {
        my ($self, $command, $rc, $signo) = @_;
        my $output;

        # Start command
        $pid = open($output, '-|', $command." 2>&1");
        if (!defined($pid)) {
            $pid = -1;    # keep the "no subprocess" marker for the handlers
            alarm(0);
            nagstat->set_status(2, "Failed to run $command");
            return 1;
        }
        while (my $line = <$output>) {
            logger->log_func(3, $line);
        }
        # The handle is not closed explicitly: close() on a piped handle
        # waits for the child itself, which is wait_probe()'s job.
        ($$rc, $$signo) = $self->wait_probe();
        return 0;
    }
}

########################################################################
# Running main probe package
########################################################################
package main;
use strict;
use warnings;

use Env qw(@PATH GLEXEC_LOCATION GLITE_LOCATION X509_USER_PROXY GLEXEC_CLIENT_CERT);
use Getopt::Long qw(:config no_ignore_case bundling);

# Settings of this run, shared by the subs below
my (
    $timeout,     # total maximum runtime for probe
    $critical,    # time after which to kill gLExec
    $warning,     # time after which to warn about slow gLExec
    $payload,     # payload plus arguments: relative uses $PATH to find
    $verbose,     # verbosity level
);

# Prints usage output and exits: usage([exitcode])
# exitcode defaults to 0, so existing argument-less calls keep exiting
# with 0.
sub usage(;$) {
    my $exitcode = shift;
    $exitcode = 0 if (!defined $exitcode);
    (my $name = $0) =~ s/.*\///;    # strip directory part
    print <<EOHELP;
Usage: $name [options]

Options:
-t|--timeout <timeout> maximum runtime for probe, default: $deftimeout sec
-w|--warning <timeout> runtime after which to warn, default: $defwarning sec
-c|--critical <timeout> runtime after which to probe is to be killed,
default: $defcritical sec
-x|--x509-user-proxy <file> set X509_USER_PROXY to given file
-g|--glexec-client-cert <file> set GLEXEC_CLIENT_CERT to given file
default: value of variable X509_USER_PROXY
-e|--execute <cmd> command to execute by gLExec
default: \"$defpayload\"
-v|--verbose be more verbose, more -v means more verbosity
-V|--version print version
--help show this helptext
-h show short usage information
EOHELP
    exit $exitcode;
}

# Prints short usage output (oneline); returns the result of print
sub shortusage() {
    (my $name = $0) =~ s/.*\///;
    print <<EOHELP;
Usage: $name [options]
EOHELP
}

# Prints probe version; returns the result of print
sub version() {
    (my $name = $0) =~ s/.*\///;
    print <<EOHELP;
$name version: $probeversion
EOHELP
}

# Parses command line options and sets global variables
# Starts from the defaults, then applies the options. Exits after
# printing help, short help or version. Invalid options print the usage
# text and exit with 3, the Nagios UNKNOWN state; the exit status must
# not be 0 there, because Nagios reads 0 as OK.
sub getopts() {
    my ($version, $help, $shorthelp);

    $timeout  = $deftimeout;
    $critical = $defcritical;
    $warning  = $defwarning;
    $payload  = $defpayload;

    # H|host and p|port are accepted as plain flags and have no variable
    # of this file attached to them.
    my $parsed = GetOptions(
        "t|timeout=i"            => \$timeout,
        "c|critical=i"           => \$critical,
        "w|warning=i"            => \$warning,
        "x|x509-user-proxy=s"    => \$X509_USER_PROXY,
        "g|glexec-client-cert=s" => \$GLEXEC_CLIENT_CERT,
        "e|execute=s"            => \$payload,
        "v|verbose+"             => \$verbose,
        "help+"                  => \$help,
        "h+"                     => \$shorthelp,
        "V|version+"             => \$version,
        "H|host",
        "p|port"
    );
    usage(3) if (!$parsed);

    usage(0) if ($help);
    if ($shorthelp) {
        shortusage();
        exit(0);
    }
    if ($version) {
        version();
        exit(0);
    }
    if (!defined $GLEXEC_CLIENT_CERT) {
        $GLEXEC_CLIENT_CERT = $X509_USER_PROXY;
    }
}

# Exit function: prints nagios status and dumps log
# The process exit status is the code returned by nagstat->get_status().
sub nagios_exit() {
    # Status line first, so it is the first thing that is printed
    my $exitcode = nagstat->get_status();

    # Collected log lines follow the status line
    logger->get_log();

    exit($exitcode);
}

# Finds gLExec in path and pre-specified directories
# Search order: $GLEXEC_LOCATION/sbin, the entries of PATH,
# $GLITE_LOCATION/sbin, and a fixed list of system directories.
# Returns the path of the first executable "glexec" found, undef when
# there is none.
# Side effects, through the variables tied by Env: GLITE_LOCATION is set
# to /opt/glite when undefined, PATH is set to "." when empty.
sub find_glexec {
    my @DEFAULT_PATH = ("/usr/local/sbin", "/usr/sbin", "/sbin",
                        "/usr/local/bin", "/usr/bin");

    # Try GLEXEC_LOCATION
    if (defined $GLEXEC_LOCATION) {
        logger->log_func(3, "GLEXEC_LOCATION=".$GLEXEC_LOCATION."\n");
        my $candidate = $GLEXEC_LOCATION."/sbin/glexec";
        if (-x $candidate) {
            logger->log_func(2, "gLExec found at ".$candidate."\n");
            return $candidate;
        }
        logger->log_func(2, "gLExec NOT found at \$GLEXEC_LOCATION\n");
    }

    # Try GLITE_LOCATION
    $GLITE_LOCATION = "/opt/glite" if (!defined $GLITE_LOCATION);
    logger->log_func(3, "GLITE_LOCATION=".$GLITE_LOCATION."\n");

    # Fall back to the current directory only when PATH really is empty.
    # The environment is tested directly: looking at $PATH[1] would also
    # overwrite a PATH that holds exactly one directory.
    if (!defined $ENV{PATH} || $ENV{PATH} eq '') {
        @PATH = (".");
    }

    for my $dir (@PATH, $GLITE_LOCATION."/sbin", @DEFAULT_PATH) {
        logger->log_func(3, "Looking for glexec in ".$dir."\n");
        my $candidate = $dir."/glexec";
        if (-x $candidate) {
            logger->log_func(2, "gLExec found at ".$candidate."\n");
            return $candidate;
        }
    }
    return undef;
}

# Translates the result of a gLExec run into a nagios status:
# glexec_to_nagios(exitcode, signal number, runtime in seconds)
#   signal number != 0        CRITICAL
#   exit code 0               OK, or WARNING when runtime >= $warning;
#                             performance data is set in both cases
#   exit code 126             WARNING
#   exit codes 201-204        CRITICAL
#   any other exit code       CRITICAL
# Returns 0 for a slow but successful run and 1 in every other case.
# NOTE(review): the header of this file lists "128+n signal" and "!=0 rc
# of payload" as WARNING, while they are CRITICAL here - confirm which of
# the two is intended. The return value 1 for plain success also looks
# inverted; it is kept as it was.
sub glexec_to_nagios($$$) {
    my ($rc, $signo, $dt) = @_;

    # gLExec exit codes that always mean CRITICAL
    my %critical_rc = (
        201 => "client error",
        202 => "system error",
        203 => "authorization error",
        204 => "exit code overlap error",
    );

    if ($signo != 0) {
        # Has to be tested first: the exit code part of the wait status
        # of a process that was killed by a signal is 0, which would
        # otherwise be reported as success.
        nagstat->set_status(2, "exit due to signal $signo ($rc)");
    } elsif ($rc == 0) {
        # Nagios performance data has the form
        # label=value[UOM];warn;crit;min
        nagstat->set_perfdata("time=${dt}s;$warning;$critical;0");
        if ($dt >= $warning) {
            nagstat->set_status(1, "gLExec took long time to succeed");
            return 0;
        }
        nagstat->set_status(0, "Success");
    } elsif ($rc == 126) {
        nagstat->set_status(1,
            "executable $payload can't be executed ($rc)");
    } elsif (exists $critical_rc{$rc}) {
        nagstat->set_status(2, $critical_rc{$rc}." ($rc)");
    } else {
        nagstat->set_status(2,
            "executable $payload failed with non-zero exit code ($rc)");
    }
    return 1;
}

# Find gLExec command, payload command (when relative), runs it and returns
# status
# Returns 1 when the probe cannot be run (gLExec not found, proxy files
# missing or empty, command cannot be started); the nagios status is set
# in those cases. Otherwise returns the result of glexec_to_nagios().
sub run_glexec() {
    my ($exitcode, $signo);

    # Make sure to have starttime
    my $t1 = time();

    # Set alarm before looking for gLExec to prevent NFS timeouts
    alarm($critical);

    # Find glexec command
    my $glexec = find_glexec();
    if (!defined $glexec) {
        nagstat->set_status(2, "glexec command not found");
        return 1;
    }

    # Check proxies; -s is only true for an existing, nonempty file
    if (!defined $X509_USER_PROXY) {
        nagstat->set_status(3, "\$X509_USER_PROXY is unset.");
        return 1;
    }
    if (! -s $X509_USER_PROXY) {
        nagstat->set_status(3,
            "\$X509_USER_PROXY does not point to a nonempty file.");
        return 1;
    }
    if (!defined $GLEXEC_CLIENT_CERT || ! -s $GLEXEC_CLIENT_CERT) {
        nagstat->set_status(3,
            "\$GLEXEC_CLIENT_CERT does not point to a nonempty file.");
        return 1;
    }

    # Find full path for payload if it's relative
    if ($payload !~ m{^/}) {
        # Command name is everything up to the first space
        (my $name = $payload) =~ s/ .*//;
        for my $dir (@PATH) {
            logger->log_func(3, "Looking for ".$name." in ".$dir."\n");
            my $fullname = $dir."/".$name;
            if (-x $fullname) {
                # \Q..\E: the name is literal text, not a pattern; a name
                # such as "g++" would otherwise be an invalid regex
                $payload =~ s/^\Q$name\E/$fullname/;
                logger->log_func(2, "Payload set to ".$payload."\n");
                last;
            }
        }
    }

    # Run actual probe in child process
    if (probeipc->run_probe("$glexec $payload", \$exitcode, \$signo) != 0) {
        return 1;
    }

    # Probe exited: find exit status
    my $t2 = time();
    return glexec_to_nagios($exitcode, $signo, $t2 - $t1);
}

# Command line first: it supplies the verbosity and the time limits that
# the steps below are given
getopts();

# Logger, with the requested loglevel
logger->new( $verbose );

# Nagios status, starting out as unknown
nagstat->new();

# Signal handling: nagios_exit is the exit function, $timeout the kill
# time and $critical the term time
probeipc->new( \&nagios_exit, $timeout, $critical );

# The gLExec probe itself
run_glexec();

# Print nagios status and log, then exit
nagios_exit();