#!/usr/bin/env bash
#########################################################################
# Script:       check_zpools.sh
# Purpose:      Nagios plugin to monitor status of zfs pool
# Authors:      Aldo Fabi             First version (2006-09-01)
#               Vitaliy Gladkevitch   Forked (2013-02-04)
#               Claudio Kuenzler      Complete redo, perfdata, etc (2013-2014)
#               Per von Zweigbergk    Various fixes (2016-10-12)
#               @waoki                Trap zpool command errors (2022-03-01)
# Doc:          http://www.claudiokuenzler.com/monitoring-plugins/check_zpools.php
# History:
# 2006-09-01    Original first version
# 2006-10-04    Updated (no change history known)
# 2013-02-04    Forked and released
# 2013-05-08    Make plugin work on different OS, pep up plugin
# 2013-05-09    Bugfix in exit code handling
# 2013-05-10    Removed old exit vars (not used anymore)
# 2013-05-21    Added performance data (percentage used)
# 2013-07-11    Bugfix in zpool health check
# 2014-02-10    Bugfix in threshold comparison
# 2014-03-11    Allow plugin to run without enforced thresholds
# 2016-10-12    Fixed incorrect shell quoting and typos
# 2022-03-01	  Merge PR #10, manually solve conflicts
# 2022-05-10    Added availability check option
#########################################################################
### Begin vars
# Exit codes handed back to the monitoring system
STATE_OK=0
STATE_WARNING=1
STATE_CRITICAL=2
STATE_UNKNOWN=3
# Extend the search path with the sbin directories and publish it to child processes
export PATH="${PATH}:/usr/sbin:/sbin"
# Pin the numeric locale for this script and everything it spawns
LC_NUMERIC="en_US.UTF-8"
export LC_NUMERIC
### End vars
#########################################################################
# Usage text. It is printed with "echo -e" further down, so each literal \n
# below turns into an additional line break at output time.
help="check_zpools.sh (c) 2006-2022 several authors\n"$'\n'
help+="Usage: $0 -p (poolname|ALL) ([-w capacity_warnpercent] [-c capacity_critpercent] | [-x available_warnpercent] [-y available_critpercent])\n"$'\n'
help+="Example capacity: $0 -p ALL -w 80 -c 90"$'\n'
help+="Example available: $0 -p ALL -x 30 -y 20"
#########################################################################
# Check necessary commands are available
# - "command -v" is a shell builtin, so the check itself cannot fail because
#   the external "which" utility is missing.
# - zfs and bc are required as well: the available-space percentage is
#   computed with them on every run.
# - "[" is not listed: bash always uses its own builtin for it.
for cmd in zpool zfs awk bc
do
  if ! command -v "$cmd" >/dev/null 2>&1
  then
    echo "UNKNOWN: ${cmd} does not exist, please check if command exists and PATH is correct"
    exit "${STATE_UNKNOWN}"
  fi
done
#########################################################################
# Print the usage text when the plugin is started without any argument or
# with --help as first argument; this counts as UNKNOWN for the caller.
if [[ $# -eq 0 || "${1}" == "--help" ]]
then
  echo -e "${help}"
  exit ${STATE_UNKNOWN}
fi
#########################################################################
# Get user-given variables
#   -p  pool name (or ALL)
#   -w  capacity warning percentage      -c  capacity critical percentage
#   -x  available warning percentage     -y  available critical percentage
# Anything else prints the usage text and ends with UNKNOWN.
while getopts "p:w:c:x:y:" opt
do
  case "${opt}" in
    p) pool="${OPTARG}" ;;
    w) warn="${OPTARG}" ;;
    c) crit="${OPTARG}" ;;
    x) avail_warn="${OPTARG}" ;;
    y) avail_crit="${OPTARG}" ;;
    *)
      echo -e "$help"
      exit $STATE_UNKNOWN
      ;;
  esac
done

#########################################################################
# The pool name (-p) is mandatory: without it show the usage text and stop.
if [[ -z "${pool}" ]]
then
  echo -e "$help"
  exit ${STATE_UNKNOWN}
fi
#########################################################################
# Verify threshold sense - capacity and availability
#
# verify_thresholds CAP_WARN CAP_CRIT AVAIL_WARN AVAIL_CRIT
#   CAP_WARN / CAP_CRIT      capacity (used %) thresholds, whole numbers
#   AVAIL_WARN / AVAIL_CRIT  availability (free %) thresholds, may be fractional
# Each pair is optional, but must be given completely, must be numeric and
# must be ordered sensibly (capacity: warn <= crit, availability: warn >= crit).
# On any violation a message is printed and the plugin exits with UNKNOWN.
# Returns 0 when everything is fine.
verify_thresholds() {
  local cap_warn=$1 cap_crit=$2 free_warn=$3 free_crit=$4
  local whole='^[0-9]+$'
  local decimal='^[0-9]+([.][0-9]+)?$'

  # Capacity thresholds
  if [[ -n $cap_warn || -n $cap_crit ]]; then
    if [[ -z $cap_warn || -z $cap_crit ]]; then
      echo "Both warning and critical thresholds must be set"
      exit "${STATE_UNKNOWN}"
    fi
    # Non-numeric input would silently evaluate to 0 in arithmetic tests
    if ! [[ $cap_warn =~ $whole && $cap_crit =~ $whole ]]; then
      echo "Capacity thresholds must be whole numbers"
      exit "${STATE_UNKNOWN}"
    fi
    # 10# forces base 10 so that values such as 08 are not read as octal
    if (( 10#$cap_warn > 10#$cap_crit )); then
      echo "Warning threshold cannot be greater than critical"
      exit "${STATE_UNKNOWN}"
    fi
  fi

  # Availability thresholds
  if [[ -n $free_warn || -n $free_crit ]]; then
    if [[ -z $free_warn || -z $free_crit ]]; then
      echo "Both warning and critical thresholds must be set"
      exit "${STATE_UNKNOWN}"
    fi
    if ! [[ $free_warn =~ $decimal && $free_crit =~ $decimal ]]; then
      echo "Availability thresholds must be numeric"
      exit "${STATE_UNKNOWN}"
    fi
    # Shell arithmetic is integer only; awk compares fractional values too
    if awk -v w="$free_warn" -v c="$free_crit" 'BEGIN { exit !((w + 0) < (c + 0)) }'; then
      echo "Warning threshold cannot be less than critical"
      exit "${STATE_UNKNOWN}"
    fi
  fi

  return 0
}

verify_thresholds "$warn" "$crit" "$avail_warn" "$avail_crit"

# Strip leading zeros from accepted capacity thresholds so later integer
# comparisons do not interpret them as octal numbers.
if [[ -n $warn && -n $crit ]]; then
  warn=$((10#$warn))
  crit=$((10#$crit))
fi
#########################################################################
# What needs to be checked?

# Abort the plugin with UNKNOWN because a zpool/zfs query could not be used.
query_failed() {
  echo "UNKNOWN zpool query failed"
  exit "${STATE_UNKNOWN}"
}

# Query one pool and fill the globals HEALTH, CAPACITY, AVAILABLE, TOTAL_SIZE,
# AVAILABLE_PERCENTAGE and AVAILABLE_KNOWN.
# Arguments: $1 - pool name
# Returns:   0 when the health could be read
#            1 when the health query failed
#            2 when zpool reports that the pool does not exist
# The size figures are collected best-effort: a pool that is not ONLINE may not
# report them, and its health alone already decides the result. Whether a
# missing figure is fatal is decided in pool_verdict, where it is needed.
read_pool_stats() {
  local name=$1 raw ratio
  local digits='^[0-9]+$'

  CAPACITY=''
  AVAILABLE=''
  TOTAL_SIZE=''
  AVAILABLE_PERCENTAGE='0.0'
  AVAILABLE_KNOWN=0

  # stderr is captured so that a failure can be told apart as "no such pool"
  if ! raw=$(zpool list -Ho health "$name" 2>&1); then
    if [[ $raw == *"no such pool"* ]]; then return 2; fi
    return 1
  fi
  HEALTH=$raw

  # Capacity is printed as "NN%"; keep everything before the percent sign
  if raw=$(zpool list -Ho capacity "$name"); then
    CAPACITY=${raw%%\%*}
  fi

  # zfs get -H prints tab separated columns; the value is the third one
  if raw=$(zfs get -H -p available "$name"); then
    IFS=$'\t' read -r _ _ AVAILABLE _ <<<"$raw"
  fi

  TOTAL_SIZE=$(zpool list -H -p -o size "$name") || TOTAL_SIZE=''

  # Only do the math on clean byte counts; this also rules out a division by zero
  if [[ $AVAILABLE =~ $digits && $TOTAL_SIZE =~ $digits ]] && (( 10#$TOTAL_SIZE > 0 )); then
    if ratio=$(echo "100/$TOTAL_SIZE*$AVAILABLE" | bc -l) && [[ -n $ratio ]]; then
      AVAILABLE_PERCENTAGE=$(printf %.1f "$ratio")
      AVAILABLE_KNOWN=1
    fi
  fi

  return 0
}

# Judge the pool described by the globals set in read_pool_stats against the
# user-given thresholds and print exactly one word:
#   health | cap_crit | cap_warn | avail_crit | avail_warn | ok | unknown
# Health is evaluated first so that a usage alarm can never hide a pool that
# is not ONLINE. Capacity thresholds take precedence over availability ones.
# "unknown" means the figure needed for the requested check is not available.
pool_verdict() {
  local digits='^[0-9]+$'

  if [[ $HEALTH != "ONLINE" ]]; then
    echo health
  elif [[ -n $warn && -n $crit ]]; then
    if ! [[ $CAPACITY =~ $digits ]]; then echo unknown
    elif [[ $CAPACITY -ge $crit ]]; then echo cap_crit
    elif [[ $CAPACITY -ge $warn ]]; then echo cap_warn
    else echo ok
    fi
  elif [[ -n $avail_warn && -n $avail_crit ]]; then
    if [[ $AVAILABLE_KNOWN -ne 1 ]]; then echo unknown
    elif (( $(echo "$AVAILABLE_PERCENTAGE < $AVAILABLE_CRIT" | bc -l) )); then echo avail_crit
    # Everything from the critical threshold up to (not including) the
    # warning threshold is a warning, the critical threshold itself included
    elif (( $(echo "$AVAILABLE_PERCENTAGE < $AVAILABLE_WARN" | bc -l) )); then echo avail_warn
    else echo ok
    fi
  else
    echo ok
  fi
}

# Normalise the availability thresholds once; they do not change per pool
if [[ -n $avail_warn && -n $avail_crit ]]; then
  AVAILABLE_WARN=$(printf %.1f "$avail_warn")
  AVAILABLE_CRIT=$(printf %.1f "$avail_crit")
fi

## Check all pools
if [ "$pool" = "ALL" ]
then
  pool_names=$(zpool list -Ho name) || query_failed

  # One pool per line; reading line by line keeps names with spaces intact
  POOLS=()
  while IFS= read -r pool_name
  do
    if [[ -n $pool_name ]]; then POOLS+=("$pool_name"); fi
  done <<<"$pool_names"

  p=0
  for POOL in "${POOLS[@]}"
  do
    read_pool_stats "$POOL" || query_failed

    case "$(pool_verdict)" in
      health)     error[$p]="$POOL health is $HEALTH"; fcrit=1 ;;
      cap_crit)   error[$p]="POOL $POOL usage is CRITICAL (${CAPACITY}%)"; fcrit=1 ;;
      cap_warn)   error[$p]="POOL $POOL usage is WARNING (${CAPACITY}%)" ;;
      avail_crit) error[$p]="POOL $POOL availability is CRITICAL (${AVAILABLE_PERCENTAGE}%)"; fcrit=1 ;;
      avail_warn) error[$p]="POOL $POOL availability is WARNING (${AVAILABLE_PERCENTAGE}%)" ;;
      unknown)    query_failed ;;
    esac

    perfdata[$p]="$POOL:CAP=${CAPACITY}%:AVAIL=${AVAILABLE_PERCENTAGE}% "
    p=$((p + 1))
  done

  if [[ ${#error[*]} -gt 0 ]]
  then
    # A single critical finding makes the whole result critical
    if [[ $fcrit -eq 1 ]]; then exit_code=${STATE_CRITICAL}; else exit_code=${STATE_WARNING}; fi
    echo "ZFS POOL ALARM: ${error[*]}|${perfdata[*]}"; exit "${exit_code}"
  else echo "ALL ZFS POOLS OK (${POOLS[*]})|${perfdata[*]}"; exit "${STATE_OK}"
  fi

## Check single pool
else
  read_pool_stats "$pool"
  case $? in
    0) ;;
    2) echo "zpool $pool does not exist"; exit "${STATE_CRITICAL}" ;;
    *) query_failed ;;
  esac

  # Perfdata attached to alarms depends on which kind of check was requested
  if [[ -n $warn && -n $crit ]]; then
    alarm_perf="$pool=${CAPACITY}%"
  elif [[ -n $avail_warn && -n $avail_crit ]]; then
    alarm_perf="$pool=${AVAILABLE_PERCENTAGE}%"
  else
    alarm_perf="CAP=${CAPACITY}%:AVAIL=${AVAILABLE_PERCENTAGE}%"
  fi

  case "$(pool_verdict)" in
    health)
      echo "ZFS POOL $pool health is $HEALTH|${alarm_perf}"; exit "${STATE_CRITICAL}" ;;
    cap_crit)
      echo "ZFS POOL $pool usage is CRITICAL (${CAPACITY}%)|${alarm_perf}"; exit "${STATE_CRITICAL}" ;;
    cap_warn)
      echo "ZFS POOL $pool usage is WARNING (${CAPACITY}%)|${alarm_perf}"; exit "${STATE_WARNING}" ;;
    avail_crit)
      echo "ZFS POOL $pool availability is CRITICAL (${AVAILABLE_PERCENTAGE}%)|${alarm_perf}"; exit "${STATE_CRITICAL}" ;;
    avail_warn)
      echo "ZFS POOL $pool availability is WARNING (${AVAILABLE_PERCENTAGE}%)|${alarm_perf}"; exit "${STATE_WARNING}" ;;
    unknown)
      query_failed ;;
    *)
      echo "ALL ZFS POOLS OK ($pool)|CAP=${CAPACITY}%:AVAIL=${AVAILABLE_PERCENTAGE}%"; exit "${STATE_OK}" ;;
  esac

fi

# Every branch above ends in an exit; this is only a safety net
echo "UNKNOWN - Should never reach this part"
exit "${STATE_UNKNOWN}"
