1 |
#!/usr/bin/perl |
2 |
|
3 |
use strict; |
4 |
|
5 |
# Nagios event handler to put a MOM offline when cvmfs gives an error |
6 |
|
7 |
# Command line (four positional arguments, extra ones are ignored):
#   1. service state:  "OK", "WARNING" or "CRITICAL"
#   2. state type:     "SOFT" or "HARD"
#   3. check attempt:  compared numerically below
#   4. repository:     handed to reset_error_counters()
# NOTE(review): these look like the Nagios $SERVICESTATE$, $SERVICESTATETYPE$,
# $SERVICEATTEMPT$ macros plus a repository name -- confirm against the
# event handler command definition.
if ( scalar(@ARGV) < 4 ) {
    print STDERR "Missing required arguments (4 required in total)\n";
    exit 1;
}

# Set DEBUG in the environment to get progress messages on STDERR.
my $debug = $ENV{DEBUG} || 0;

my $pbsnodes = "/usr/bin/pbsnodes";

# Marker stored in the node's note so that only nodes taken offline by this
# handler are ever put back online by it.
my $node_offline_comment = "Nagios event handler: cvmfs error";

# Prefer the environment, fall back to asking the system.
my $hostname = $ENV{HOSTNAME};
$hostname = `hostname -f` unless $hostname;

# Not referenced anywhere in the visible part of this script.
my $verb = 1;

chomp $hostname;

#
# Dispatch on the reported service state.
#
my ( $service_state, $state_type, $attempt, $repository ) = @ARGV;
my $is_soft = $state_type eq "SOFT";
my $is_hard = $state_type eq "HARD";

if ( $service_state eq "OK" ) {

    # Confirmed recovery: the node may be put back online.
    check_node_state() if $is_hard;
}
elsif ( $service_state eq "CRITICAL" ) {

    # Take the node offline once the problem is confirmed (HARD), or as soon
    # as a SOFT problem has been seen on more than one attempt.
    if ( $is_hard || ( $is_soft && $attempt > 1 ) ) {
        change_node_state("offline");
    }
}
elsif ( $service_state eq "WARNING" ) {

    # A warning may be about cache errors: try to clear the counters while
    # the state is SOFT, or HARD with an attempt number below 4.
    if ( $is_soft || ( $is_hard && $attempt < 4 ) ) {
        reset_error_counters($repository);
    }
}

exit 0;
51 |
|
52 |
|
53 |
|
54 |
# Ask cvmfs-talk to reset the error counters of one repository.
#
# Arguments:
#   $_[0] - repository name, passed to `cvmfs-talk -i`
#
# Returns the value of system() for the cvmfs-talk command, or -1 without
# running anything when the repository name is missing or contains anything
# other than letters, digits, '.', '_' and '-'.
#
# The name comes from the command line and ends up inside a shell command, so
# it is validated first: it must not be able to carry shell metacharacters or
# start with a dash and be taken for an option.
sub reset_error_counters {
    my ($repo) = @_;

    unless ( defined $repo && $repo =~ /\A[A-Za-z0-9][A-Za-z0-9._-]*\z/ ) {
        print STDERR "Invalid repository name, error counters not reset\n";
        return -1;
    }

    # The shell is kept only to discard the command's output.
    my $cmd = "cvmfs-talk -i $repo reset error counters > /dev/null 2>&1";
    return system($cmd);
}
59 |
|
60 |
|
61 |
|
62 |
# Update this node's note through pbsnodes and, where appropriate, its state.
#
# Arguments:
#   $_[0] - "offline": make sure $node_offline_comment is part of the note
#                      and run pbsnodes with -o
#           "clear":   remove $node_offline_comment from the note; pbsnodes
#                      is run with -c only when the note ends up empty, so a
#                      node that carries other comments just gets its note
#                      rewritten
#
# Any other value prints an error on STDERR and returns without running
# pbsnodes. Dies when pbsnodes cannot be started to read the current note.
# Otherwise returns the value of system() for the updating pbsnodes call.
#
# pbsnodes is always run without a shell (list-form open/system), so neither
# the host name nor the text of an existing note can be interpreted as shell
# syntax.
sub change_node_state {
    my ($reqstate) = @_;

    # Reject unknown requests before touching pbsnodes at all.
    unless ( defined $reqstate
        && ( $reqstate eq "offline" || $reqstate eq "clear" ) )
    {
        my $shown = defined $reqstate ? $reqstate : "";
        print STDERR "Invalid state $shown\n";
        return;
    }

    # Fetch the current note; it stays empty when the node has none.
    my $curcmt = "";
    open( my $pbs, '-|', $pbsnodes, '-a', $hostname )
        or die "Cannot read from $pbsnodes\n";
    while ( my $line = <$pbs> ) {
        next unless $line =~ /^\s*note/;
        chomp $line;
        ( $curcmt = $line ) =~ s!^\s*note\ =\ !!;
        $debug and print STDERR "current comment = $curcmt\n";
        last;
    }
    close $pbs;

    # \Q...\E: the marker is literal text, not a pattern.
    my $newcmt;
    if ( $reqstate eq "offline" ) {
        if ( $curcmt =~ /\Q$node_offline_comment\E/ ) {
            $debug and print STDERR "keep comment\n";
            $newcmt = $curcmt;
        }
        else {
            $debug and print STDERR "add cvmfs comment\n";
            $newcmt = "$curcmt,$node_offline_comment";
        }
    }
    else {
        ( $newcmt = $curcmt ) =~ s!\Q$node_offline_comment\E!!;

        # Removing the marker from the middle of the list leaves ",,".
        $newcmt =~ s!,{2,}!,!g;
    }

    # remove leading or trailing commas
    $newcmt =~ s!^,+!!;
    $newcmt =~ s!,+$!!;

    my @state_option;
    if ( $newcmt eq "" ) {
        @state_option = ("-c");    # only clear node state if no other comments
    }
    elsif ( $reqstate eq "offline" ) {
        @state_option = ("-o");
    }

    return system( $pbsnodes, @state_option, $hostname, "-N", $newcmt );
}
109 |
|
110 |
# Put the node back online, but only if this handler took it offline.
#
# Runs `pbsnodes -l <host> -n` and looks for a line of its output that
# contains "offline" followed by $node_offline_comment. When one is found,
# change_node_state("clear") is called. When pbsnodes prints nothing or
# cannot be started, nothing is changed. When there is output without the
# marker, the node is left alone and a debug message is printed.
#
# pbsnodes is run without a shell, and the marker is matched as literal text
# rather than as a regular expression.
sub check_node_state {
    my $pbs;
    unless ( open( $pbs, '-|', $pbsnodes, '-l', $hostname, '-n' ) ) {
        $debug and print STDERR "Cannot run $pbsnodes: $!\n";
        return;
    }
    my $nodeline = join "", <$pbs>;
    close $pbs;

    # No output for this host: nothing to do.
    return if $nodeline eq "";

    if ( $nodeline =~ /offline.*\Q$node_offline_comment\E/ ) {
        change_node_state("clear");
    }
    else {
        $debug and print STDERR "Will not clear node, could not find comment $node_offline_comment\n";
    }
    return;
}