summaryrefslogtreecommitdiff
path: root/nagios-check-raid.pl
blob: f971d664c98ea289913ef212221c25f409d62a92 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
#!/usr/bin/perl -w
# ------------------------------------------------------------------------------
# File Name:            check_raid.pl
# Author:               Thomas Nilsen - Norway
# Date:                 14/06/2003
# Version:              0.1
# Description:          This script will check to see if any software raid
#                       devices are down.
# Email:                thomas.nilsen@doc-s.co.uk
# WWW:                  www.doc-s.co.uk
# ------------------------------------------------------------------------------
# Copyright 2003 (c) Thomas Nilsen
# Credits go to Ethan Galstad for coding Nagios
# License GPL
# ------------------------------------------------------------------------------
# Date          Author          Reason
# ----          ------          ------
# 2008-03-31    Peter Palfrader Return warning on running resync
# 2007-11-07    Peter Palfrader Return unknown if /proc/mdstat does not exist
# 05/10/2004    Peter Palfrader Make it work without that 'use util (vars)'
# 14/06/2003    TN              Initial Release
#                               - Format of mdstat assumed to be "2 line" per
#                                 device with [??] on the second line.
# ------------------------------------------------------------------------------

use strict;
use warnings;
use Getopt::Long;;
use vars qw($opt_V $opt_h $opt_t $opt_F $PROGNAME);
use lib '/usr/local/nagios/libexec/';
# Default number of seconds before the ALRM handler aborts the check.
my $TIMEOUT=15;
# Nagios plugin exit codes. UNKNOWN has to be 3: a negative value passed to
# exit() is reported to the parent as 255, which is outside the 0-3 range
# Nagios defines for plugin results.
my %ERRORS = ( OK => 0, WARNING => 1, CRITICAL => 2, UNKNOWN => 3 );


$PROGNAME="check_raid";

# Forward declarations so the subs (defined at the end of the file) can be
# called with their empty prototypes from the main code below.
sub print_help ();
sub print_usage ();

# Empty PATH and the shell start-up variables so nothing inherited from the
# caller's environment is used.
$ENV{'PATH'}='';
$ENV{'BASH_ENV'}='';
$ENV{'ENV'}='';
# File-scoped working variables shared by the option handling and the
# mdstat parsing code that follow.
my ( $line, $stat, $state, $msg, $status, $timeout );

# Default status file; may be overridden with -F.
$stat="/proc/mdstat";

# Option checking.
# -F and -t carry a value, so they are declared with "=s" / "=i" and
# Getopt::Long consumes the argument itself. Declared as plain flags they
# were only ever set to 1, which turned "-t 30" into a one second timeout
# and made -F pick up whatever argument happened to be left over first.
Getopt::Long::Configure('bundling');
$status = GetOptions(
        "V|version"    => \$opt_V,
        "h|help"       => \$opt_h,
        "F|filename=s" => \$opt_F,
        "t|timeout=i"  => \$opt_t,
);
# GetOptions has already printed a diagnostic for the bad option.
if ( !$status ) {
        print_usage();
        exit $ERRORS{'UNKNOWN'};
}
# Version
if ($opt_V) {
        print($PROGNAME, ': $Revision: 0.1 $', "\n");
        exit $ERRORS{'OK'};
}
# Help
if ($opt_h) {
        print_help();
        exit $ERRORS{'OK'};
}
# Filename supplied
if ( defined $opt_F ) {
        # Copy the value through a capture group, as the original code did.
        $stat = $1 if ( $opt_F =~ /^(.*)$/ );

        if ( !-r $stat ) {
                print "Invalid mdstat file: $opt_F\n";
                exit $ERRORS{'UNKNOWN'};
        }
}

# Fall back to the default unless a positive whole number of seconds was
# given; alarm(0) would switch the watchdog off entirely.
$timeout = $TIMEOUT;
if ( defined $opt_t && $opt_t =~ /^([0-9]+)$/ && $1 > 0 ) {
        $timeout = $1;
}

# Just in case of problems, let's not hang Nagios
$SIG{'ALRM'} = sub {
	print ("ERROR: No response (alarm)\n");
	exit $ERRORS{'UNKNOWN'};
};
alarm($timeout);

# Start checking the file...
# Three-argument open with an explicit read mode: the path may come from
# -F, and with two-argument open leading/trailing characters such as '>'
# or '|' in it would be interpreted as a mode or a command.
my $mdstat_fh;
if ( !open( $mdstat_fh, '<', $stat ) ) {
	print("UNKNOWN: Cannot open $stat: $!\n");
	exit $ERRORS{'UNKNOWN'};
}
$state = $ERRORS{'OK'};
$msg = "";

my @resyncing = ();    # one human-readable entry per resyncing array
my $device = '';       # most recently seen "mdN" header line

# Now check the mdstat file..
while ( defined( $line = <$mdstat_fh> ) ) {
	if ( $line =~ /^(md\S*) / ) {
		# Device header line: remember the name for the lines below it.
		$device = $1;
	}
	elsif ( $line =~ / \[_|_\]|U_|_U / ) {
		# An underscore in the [UU] status map marks a missing member.
		$state = $ERRORS{'CRITICAL'};
		$msg = $msg . $device . ": - ";
	}
	elsif ( $line =~ / resync / ) {
		#       [==>..................]  resync = 10.3% (15216320/146994624) finish=2153.2min speed=1018K/sec
		my ($percent) = ( $line =~ m# resync = ([0-9.]+%)# );
		my ($finish)  = ( $line =~ m# finish=([0-9.]+min)# );
		my ($speed)   = ( $line =~ m# speed=([0-9.]+K/sec)# );
		# A field that did not parse must not end up as undef in the
		# message (that would emit warnings into the plugin output).
		for my $field ( $percent, $finish, $speed ) {
			$field = 'unknown' unless defined $field;
		}
		push @resyncing, "$device ($percent done, finish in $finish at $speed)";
	}
}
close($mdstat_fh);

# A failed member outranks a running resync, which outranks OK.
if ( $state == $ERRORS{'CRITICAL'} ) {
	print "CRITICAL - Device(s) $msg have failed\n";
}
elsif ( scalar @resyncing > 0 ) {
	print "WARNING: Resyncing: " . ( join "; ", @resyncing ) . "\n";
	$state = $ERRORS{'WARNING'};
}
elsif ( $state == $ERRORS{'OK'} ) {
	print "OK - All devices are online\n";
}
exit $state;


# Write the one-line command synopsis to standard output.
# Takes no arguments; the program name comes from the global $PROGNAME.
sub print_usage () {
        printf "Usage: %s -t <timeout> -F <filename>\n", $PROGNAME;
}

# Print the full help text (revision, copyright, usage and option summary)
# to standard output. Takes no arguments.
sub print_help () {
        # Print the revision line directly: no print_revision() helper is
        # defined or imported in this script, so calling it would abort -h.
        print($PROGNAME, ': $Revision: 0.1 $', "\n");
        print "Copyright (c) 2003 Thomas Nilsen/Karl DeBisschop\n";
        print "\n";
        print_usage();
        # The default shown is taken from $TIMEOUT so the help text cannot
        # drift away from the value the script actually uses.
        print "Checks the mdstat file for errors on any configured software raid.\n
-t ( --timeout=INTEGER)
	Seconds before script times out (default: $TIMEOUT)\n
-F ( --filename=FILE)
	Full path and name to mdstat file (usually '/proc/mdstat') \n\n";
}