Nagios監控SSD健康的腳本

生產環境中使用了SSD磁盤,使用 smartctl -a /dev/sdb 可以查看該磁盤的各項SMART屬性值。當某項屬性的Value或Worst等於或小於Thresh時,就需要注意了。下面是腳本:

 

  #!/usr/bin/perl
  #
  # Nagios plugin: check SSD health information.
  #
  # Runs "sudo /usr/sbin/smartctl -a /dev/<device>", parses the
  # "Vendor Specific SMART Attributes with Thresholds" table and, for the
  # attribute IDs listed in %ssd_attribute, compares VALUE and WORST with
  # THRESH:
  #   VALUE <= THRESH  -> CRITICAL (exit 2)
  #   WORST <= THRESH  -> WARNING  (exit 1)
  #   otherwise        -> OK       (exit 0)
  # The plugin exits 2 when the smartctl output cannot be recognised, 1 when
  # no attribute row could be parsed, and 3 when --device is not acceptable.
  #
  # Options:
  #   --device=NAME   device node below /dev (default: sdb)
  #   --debug         print parsed rows and a dump of the collected data
  #
  # usage: add "nagios  ALL=(root)      NOPASSWD: /usr/sbin/smartctl" to /etc/sudoers file;

  use strict;
  use warnings;
  use Data::Dumper;
  use Getopt::Long;

  my ($result, $device, $h, $debug);
  my ($start_time, $use_time) = (time, 0.00);

  # Attribute IDs that are evaluated; every other ID is skipped. The names
  # here are the labels printed in the plugin output (the ATTRIBUTE_NAME
  # column reported by smartctl is stored but not used for the message).
  my %ssd_attribute = (
      5   => "Reallocated_Sector_Ct",
      184 => "End_to_End_Error_Detection_Count",
      225 => "Raw_Read_Error_Rate",
      232 => "Available_Reserver_Space",
      233 => "Media_Wearout_Indicator",
      9   => "Power_On_Hours",
  );

  # Column names of one attribute row, in the order the row regex captures them.
  my @attribute_columns = qw(
      id attribute_name flag value worst thresh type updated when_failed raw_value
  );

  $result = GetOptions(
      "device=s" => \$device,
      "debug"    => \$debug,
  );

  $device ||= 'sdb';
  $debug  ||= 0;

  # The device name is interpolated into a shell command that runs under
  # sudo, so refuse anything that is not a plain path component string.
  if ($device !~ m{\A[A-Za-z0-9_./:-]+\z}) {
      print "UNKNOWN - invalid device name|status=3 time=0.00\n";
      exit(3);
  }

  $h->{$device}->{output}   = "";
  $h->{$device}->{perfdata} = "";
  $h->{$device}->{status}   = 0;

  my $smartctl_cmd = "sudo /usr/sbin/smartctl -a /dev/$device 2>&1";
  $h->{$device}->{total_info} = `$smartctl_cmd`;
  # Backticks yield undef when the command could not be started at all.
  $h->{$device}->{total_info} = "" if !defined $h->{$device}->{total_info};

  # Captures, in order: information section, overall-health result,
  # attribute table text, SMART error log version.
  if ( my @section = $h->{$device}->{total_info} =~ m{===\s+START\s+OF\s+INFORMATION\s+SECTION\s+===(.*)===\s+START\s+OF\s+READ\s+SMART\s+DATA\s+SECTION\s+===\s+SMART\s+overall-health\s+self-assessment\s+test\s+result:\s+(\w+)[\d\D]+Vendor\s+Specific\s+SMART\s+Attributes\s+with\s+Thresholds([\d\D]+)SMART\s+Error\s+Log\s+Version:\s+(\d+)}is ) {
      # Copy the captures right away so later matches cannot disturb them.
      (   $h->{$device}->{info_section},
          $h->{$device}->{smart_test_result},
          $h->{$device}->{healt_result},
          $h->{$device}->{smart_error_log_version},
      ) = @section;

      print "************************************** get $device healt info sta **************************************\n" if $debug;
      foreach my $line (split /\n/, $h->{$device}->{healt_result}) {
          # A data row starts with the numeric ID followed by nine more
          # whitespace-separated columns; the header row has no leading number.
          my @field = $line =~ m{\A\s*(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)};
          next if !@field;

          my %row;
          @row{@attribute_columns} = @field;
          print "ID:$row{id}\tATTRIBUTE_NAME:$row{attribute_name}\tFLAG:$row{flag}\tVALUE:$row{value}\tWORST:$row{worst}\tTHRESH:$row{thresh}\tTYPE:$row{type}\tUPDATED:$row{updated}\tWHEN_FAILED:$row{when_failed}\tRAW_VALUE:$row{raw_value}\n" if $debug;
          $h->{$device}->{healt}->{ $row{id} } = \%row;
      }
      print "************************************** get $device healt info end **************************************\n" if $debug;
      $h->{$device}->{match} = 1;
  }
  else {
      $h->{$device}->{match} = 0;
  }

  print "runging..... `$smartctl_cmd`\n" if $debug;
  print "\n\n-------------------------------- Dumper \$h sta --------------------------------\n" if $debug;
  print Dumper $h if $debug;
  print "-------------------------------- Dumper \$h end --------------------------------\n\n" if $debug;

  $use_time = sprintf("%0.2f", time - $start_time);

  if ( !$h->{$device}->{match} ) {
      print "CRITICAL - smartctl get $device total info fail|status=1 time=$use_time\n";
      exit(2);
  }
  if ( !exists $h->{$device}->{healt} ) {
      print "WARNING - smartctl get $device healt info fail|status=1 time=$use_time\n";
      exit(1);
  }

  # Walk the attributes in numeric ID order so the message is reproducible.
  foreach my $id (sort { $a <=> $b } keys %{ $h->{$device}->{healt} }) {
      if ( !exists $ssd_attribute{$id} ) {
          print "not exists \$ssd_attribute{\$id},now next\n" if $debug;
          next;
      }

      my $attr = $h->{$device}->{healt}->{$id};
      print "----------------------------- loop \$h->{\$device}->{healt} hash -----------------------------\n" if $debug;
      print $attr->{worst} . "\t"  if $debug;
      print $attr->{value} . "\t"  if $debug;
      print $attr->{thresh} . "\n" if $debug;

      # The comparisons below are numeric; skip a row whose columns are not.
      if ( grep { $attr->{$_} !~ m{\A\d+\z} } qw(value worst thresh) ) {
          print "non numeric value/worst/thresh for id $id,now next\n" if $debug;
          next;
      }

      my $severity = 0;
      if ( $attr->{value} <= $attr->{thresh} ) {
          $severity = 2;
          print 'value <= thersh' . "\n" if $debug;
      }
      elsif ( $attr->{worst} <= $attr->{thresh} ) {
          $severity = 1;
          print 'worst <= thersh' . "\n" if $debug;
      }
      next if !$severity;

      $h->{$device}->{output} .= "id:$id attribute_name:" . $ssd_attribute{$id} . " value:" . $attr->{value} . " ";
      # Only ever raise the status: a WARNING attribute must not hide a
      # CRITICAL one that was seen earlier.
      $h->{$device}->{status} = $severity if $severity > $h->{$device}->{status};
  }

  if ( $h->{$device}->{status} == 0 ) {
      print "OK - $device SSD attribute healt" . "|status=" . $h->{$device}->{status} . " time=$use_time\n";
  }
  else {
      # The label is derived from the final status so it always agrees with
      # the exit code.
      my $label = $h->{$device}->{status} == 2 ? "CRITICAL" : "WARNING";
      print "$label - $device " . $h->{$device}->{output} . "|status=" . $h->{$device}->{status} . " time=$use_time\n";
  }
  exit( $h->{$device}->{status} );

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章