LSI Monitoring script Print

  • 122

https://github.com/racker/rackspace-monitoring-agent-plugins-contrib/blob/master/megaraid.sh
#!/bin/bash
  # Rackspace Cloud Monitoring Plug-In
  # megaraid plugin to query SMART status of drives attached to LSI megaraid or
  # DELL PERC {3,700} raid controllers.
  #
  # ----------------------------------------------------------------------------
  # "THE BEER-WARE LICENSE" (Revision 42):
  # <simon.vetter@runbox.com> wrote this file. As long as you retain this notice
  # you can do whatever you want with this stuff. If we meet some day, and you
  # think this stuff is worth it, you can buy me a beer in return
  # ----------------------------------------------------------------------------
  #
  # Usage:
  # Place plug-in in /usr/lib/rackspace-monitoring-agent/plugins
  #
  # This plugin returns 5 metrics:
  # - failed : the number of drives in failed state,
  # - prefail : the number of drives in prefail state,
  # - unknown : the number of drives for which the smart state could not
  # be determined,
  # - ok : the number of drives in OK state,
  # - report : a string reporting the drive id, vendor, serial number
  # as well as the smart state for non-ok drives.
  # e.g. /dev/bus/0 -d megaraid,4 SEAGATE 6SL28GNF FAILED \
  # ^controller & drive ids ^vendor ^serial# ^state
  # ( HARDWARE IMPENDING FAILURE GENERAL HARD DRIVE FAILURE [asc=5d, ascq=10] )
  # ^SMART health status for this drive
  #
  # The following is an example 'criteria' for a Rackspace Monitoring Alarm:
  #
  # if (metric['failed'] != 0) {
  # return new AlarmStatus(CRITICAL, '#{failed} failed drive(s): #{report}');
  # }
  #
  # if (metric['prefail'] != 0) {
  # return new AlarmStatus(WARNING, '#{prefail} prefail drive(s): #{report}');
  # }
  #
  # if (metric['unknown'] != 0) {
  # return new AlarmStatus(WARNING, '#{unknown} unknown drive(s): #{report}');
  # }
  #
  # return new AlarmStatus(OK, '#{ok} drive(s) OK');
  #
  # Things to keep in mind:
  # - this plugin needs a fairly recent version of smartmontools (tested OK with 6.2)
  # (apt-get install smartmontools) but does NOT need megacli.
  # - on big and loaded arrays, the plugin can take more than 10s (default agent plugin
  # timeout) to complete. Some disks are slower than others, not surprisingly.
  # - as of now, this plugin only checks individual drives and not the status of the
  # array as seen by the controller. I'd add it, but it seems hard to extract without
  # megacli which I'm trying to stay away from. If you know of a way, please let me
  # know.
  #
  #
  SMARTCTL=$(which smartctl)
   
  OK_CNT=0
  PREFAIL_CNT=0
  FAILED_CNT=0
  UNKNOWN_CNT=0
  REPORT=""
   
  # discover all drives
  DEVLIST=$(${SMARTCTL} --scan 2>/dev/null)
  if [ $? -ne 0 ]
  then
  echo status failed to perform drive discovery
  exit 1
  fi
   
  while read DEV
  do
  STAT=$(${SMARTCTL} ${DEV} --info --health 2>/dev/null)
  STATRC=$?
  SHS=$(echo "${STAT}" | grep -i 'smart health status:' | cut -d':' -f2)
  DRIVE_ID=$(echo "${STAT}" | grep -iE '(vendor:|serial number:)' | cut -d':' -f2 | xargs)
   
  # Bit 3: SMART status check returned "DISK FAILING".
  if [ $((${STATRC} & (2**3))) -ne 0 ]; then
  ((FAILED_CNT++))
  REPORT="${REPORT} ${DEV} ${DRIVE_ID} FAILED (${SHS} ) "
  # Bit 4: We found prefail Attributes <= threshold.
  # Bit 5: SMART status check returned "DISK OK" but we found that some (usage or prefail)
  # attributes have been <= threshold at some time in the past.
  elif [ $((${STATRC} & (2**4) | ${STATRC} & (2**5))) -ne 0 ]; then
  ((PREFAIL_CNT++))
  REPORT="${REPORT} ${DEV} ${DRIVE_ID} PREFAIL (${SHS} ) "
  # Anything else (drive open failed, smart command failed, etc.) maps to unknown to me
  elif [ ${STATRC} -ne 0 ]; then
  ((UNKNOWN_CNT++))
  REPORT="${REPORT} ${DEV} ${DRIVE_ID} UNKNOWN (${SHS} ) "
  else
  ((OK_CNT++))
  fi
  # only care for /dev/bus devices. /dev/sd* are logical disks
  # and do not respond to any SMART command.
  done < <(echo "${DEVLIST}" | grep /dev/bus/ | cut -d'#' -f1)
   
  if [ "z${REPORT}" == "z" ]; then
  REPORT="all drives OK"
  fi
   
  echo "status smart status retrieved"
  echo "metric failed uint32 ${FAILED_CNT}"
  echo "metric prefail uint32 ${PREFAIL_CNT}"
  echo "metric unknown uint32 ${UNKNOWN_CNT}"
  echo "metric ok uint32 ${OK_CNT}"
  echo "metric report string ${REPORT}"
   
  exit 0







Was this answer helpful?

« Back

["\r\n