| #!/bin/bash | |
| # Rackspace Cloud Monitoring Plug-In | |
| # megaraid plugin to query SMART status of drives attached to LSI megaraid or | |
| # DELL PERC {3,700} raid controllers. | |
| # | |
| # ---------------------------------------------------------------------------- | |
| # "THE BEER-WARE LICENSE" (Revision 42): | |
| # <simon.vetter@runbox.com> wrote this file. As long as you retain this notice | |
| # you can do whatever you want with this stuff. If we meet some day, and you | |
| # think this stuff is worth it, you can buy me a beer in return | |
| # ---------------------------------------------------------------------------- | |
| # | |
| # Usage: | |
| # Place plug-in in /usr/lib/rackspace-monitoring-agent/plugins | |
| # | |
| # This plugin returns 5 metrics: | |
| # - failed : the number of drives in failed state, | |
| # - prefail : the number of drives in prefail state, | |
| # - unknown : the number of drives for which the smart state could not | |
| # be determined, | |
| # - ok : the number of drives in OK state, | |
| # - report : a string reporting the drive id, vendor, serial number | |
| # as well as the smart state for non-ok drives. | |
| # e.g. /dev/bus/0 -d megaraid,4 SEAGATE 6SL28GNF FAILED \ | |
| # ^controller & drive ids ^vendor ^serial# ^state | |
| # ( HARDWARE IMPENDING FAILURE GENERAL HARD DRIVE FAILURE [asc=5d, ascq=10] ) | |
| # ^SMART health status for this drive | |
| # | |
| # The following is an example 'criteria' for a Rackspace Monitoring Alarm: | |
| # | |
| # if (metric['failed'] != 0) { | |
| # return new AlarmStatus(CRITICAL, '#{failed} failed drive(s): #{report}'); | |
| # } | |
| # | |
| # if (metric['prefail'] != 0) { | |
| # return new AlarmStatus(WARNING, '#{prefail} prefail drive(s): #{report}'); | |
| # } | |
| # | |
| # if (metric['unknown'] != 0) { | |
| # return new AlarmStatus(WARNING, '#{unknown} unknown drive(s): #{report}'); | |
| # } | |
| # | |
| # return new AlarmStatus(OK, '#{ok} drive(s) OK'); | |
| # | |
| # Things to keep in mind: | |
| # - this plugin needs a fairly recent version of smartmontools (tested OK with 6.2) | |
| # (apt-get install smartmontools) but does NOT need megacli. | |
| # - on big and loaded arrays, the plugin can take more than 10s (default agent plugin | |
| # timeout) to complete. Some disks are slower than others, not surprisingly. | |
| # - as of now, this plugin only checks individual drives and not the status of the | |
| # array as seen by the controller. I'd add it, but it seems hard to extract without | |
| # megacli which I'm trying to stay away from. If you know of a way, please let me | |
| # know. | |
| # | |
| # | |
| SMARTCTL=$(which smartctl) | |
| OK_CNT=0 | |
| PREFAIL_CNT=0 | |
| FAILED_CNT=0 | |
| UNKNOWN_CNT=0 | |
| REPORT="" | |
| # discover all drives | |
| DEVLIST=$(${SMARTCTL} --scan 2>/dev/null) | |
| if [ $? -ne 0 ] | |
| then | |
| echo status failed to perform drive discovery | |
| exit 1 | |
| fi | |
| while read DEV | |
| do | |
| STAT=$(${SMARTCTL} ${DEV} --info --health 2>/dev/null) | |
| STATRC=$? | |
| SHS=$(echo "${STAT}" | grep -i 'smart health status:' | cut -d':' -f2) | |
| DRIVE_ID=$(echo "${STAT}" | grep -iE '(vendor:|serial number:)' | cut -d':' -f2 | xargs) | |
| # Bit 3: SMART status check returned "DISK FAILING". | |
| if [ $((${STATRC} & (2**3))) -ne 0 ]; then | |
| ((FAILED_CNT++)) | |
| REPORT="${REPORT} ${DEV} ${DRIVE_ID} FAILED (${SHS} ) " | |
| # Bit 4: We found prefail Attributes <= threshold. | |
| # Bit 5: SMART status check returned "DISK OK" but we found that some (usage or prefail) | |
| # attributes have been <= threshold at some time in the past. | |
| elif [ $((${STATRC} & (2**4) | ${STATRC} & (2**5))) -ne 0 ]; then | |
| ((PREFAIL_CNT++)) | |
| REPORT="${REPORT} ${DEV} ${DRIVE_ID} PREFAIL (${SHS} ) " | |
| # Anything else (drive open failed, smart command failed, etc.) maps to unknown to me | |
| elif [ ${STATRC} -ne 0 ]; then | |
| ((UNKNOWN_CNT++)) | |
| REPORT="${REPORT} ${DEV} ${DRIVE_ID} UNKNOWN (${SHS} ) " | |
| else | |
| ((OK_CNT++)) | |
| fi | |
| # only care for /dev/bus devices. /dev/sd* are logical disks | |
| # and do not respond to any SMART command. | |
| done < <(echo "${DEVLIST}" | grep /dev/bus/ | cut -d'#' -f1) | |
| if [ "z${REPORT}" == "z" ]; then | |
| REPORT="all drives OK" | |
| fi | |
| echo "status smart status retrieved" | |
| echo "metric failed uint32 ${FAILED_CNT}" | |
| echo "metric prefail uint32 ${PREFAIL_CNT}" | |
| echo "metric unknown uint32 ${UNKNOWN_CNT}" | |
| echo "metric ok uint32 ${OK_CNT}" | |
| echo "metric report string ${REPORT}" | |
| exit 0 |