File: //usr/lib64/nagios/plugins/vz7_check_a2_backups-check.sh
#!/bin/bash
#
# Nag monitoring script for vz7 nodes
# To monitor vm backups
# Ref SYSENG-1670, SYSENG-1822, SYSENG-3159, SYSENG-3288, SYSENG-3818
#
# Reference timestamps, in seconds since the epoch.
now=$(date +%s)
fivedaysago=$(( now - 432000 ))   # 5 days  = 5 * 86400 seconds
tendaysago=$(( now - 864000 ))    # 10 days = 10 * 86400 seconds

# General counters/tunables (no use of these three is visible later in this script).
critical=0
warning=0
threshold=18000

# Flag: becomes 1 once a vm that was previously logged as suspended is found running again.
suspended_vms_present=0

# Maximum recovery time for suspended vms, in days.
# With a value of x, a vm that was suspended for more than x days is ignored by
# the check for at most x+1 days after it has been unsuspended.
suspended_vms_max_recovery=10
suspended_vms_max_ignore=$(( suspended_vms_max_recovery + 1 ))
# Cache file that holds the output of `prlctl backup-list`.
pbackupcache="/var/cache/prlctl_backup_list.cache"

# Backup server name shown in the plugin output:
#  - contents of /etc/vz/backup_location when that file exists,
#  - otherwise the part before the first ':' of the /etc/fstab line that
#    starts with "10" and mentions "backup-local".
if [ ! -f /etc/vz/backup_location ]; then
  backup_server=$(awk -F: '/^10.*backup-local/ {print $1}' /etc/fstab)
else
  backup_server=$(</etc/vz/backup_location)
fi
# Result maps keyed by container id (value = snapshot count that tripped the
# limit), plus the per-vm skip counter and the list of vms to leave out.
declare -A total_snapshots_critical total_snapshots_warning recent_snapshots_critical recent_snapshots_warning skipped_vms_count
declare -a vms_to_skip_from_check

# Relaxed limits for unmanaged vz7 nodes.
# The hostname is matched directly instead of executing the (empty) output of
# a backtick substitution, and the character class is [7m]: the previous
# [7|m] also accepted a literal '|'.
unmanaged_node_re='vz[7m]-(t)?vu[0-9]'
if [[ "$(hostname)" =~ ${unmanaged_node_re} ]]; then
  recent_snapshots_critical_limit=1
  recent_snapshots_warning_limit=2
  total_snapshots_critical_limit=3
  total_snapshots_warning_limit=5
else
  recent_snapshots_critical_limit=2
  recent_snapshots_warning_limit=3
  total_snapshots_critical_limit=5
  total_snapshots_warning_limit=7
fi
# Keep track of suspended vms: append their ids to /var/log/vzbackup.log and use
# the mtime of a stamp file to remember when that was last done, so the entry
# is not written on every run.
date_check="/var/log/suspended_vms"
# Create the stamp file only when it is missing. The previous test looked at
# the undefined variable $file_path, so the file was touched on every run, its
# mtime was always fresh, and the logging branch below could never fire.
if [[ ! -e "${date_check}" ]]; then
  touch "${date_check}"
fi
# NOTE(review): GNU find discards fractional days for -mtime, so "+1" may only
# match once the stamp is at least two days old — confirm the intended interval.
if [[ -n "$(find "${date_check}" -mtime +1)" ]]; then
  # Rewriting the stamp file refreshes its mtime.
  date > "${date_check}"
  # One line per logging run: "<timestamp> suspended vms skipped: <id> <id> ..."
  printf '%s suspended vms skipped: %s\n' \
    "$(date '+%Y-%m-%d %H:%M:%S')" \
    "$(prlctl list -a | awk '/suspended/ {printf "%s ", $1}')" \
    >> /var/log/vzbackup.log
fi
# Search the 2 most recently modified vzctl logs for new vms (vms created
# within the last 10 days). Result: one "{<id>}" per line in new_containers.
new_containers=""
mapfile -t recent_vzctl_logs < <(ls -1t /var/log/vzctl.log* 2>/dev/null | head -2)
# Guard against "no log file found": a bare `cat` would otherwise read stdin.
if [ "${#recent_vzctl_logs[@]}" -gt 0 ]; then
  new_containers=$(
    cat -- "${recent_vzctl_logs[@]}" \
      | awk -v eda="${tendaysago}" '
          /Creating.*Container/ {
            # Convert the first field (the timestamp) to epoch seconds.
            cmd = "date -d " $1 " +%s"
            datestamp = ""
            cmd | getline datestamp
            # Close the pipe so a repeated timestamp is re-read instead of
            # silently reusing the value left over from another line.
            close(cmd)
            # Field 5 is the container id; keep it when created after eda.
            if (datestamp + 0 > eda + 0) print "{" $5 "}"
          }'
  )
fi
# Get backup list from cache, or generate it if the cache is missing/empty
# (very slow). A freshly generated listing is also written to the cache.
if [ -s "${pbackupcache}" ]; then
  backuplisting=$(<"${pbackupcache}")
elif [ -f /etc/vz/backup_location ]; then
  # New Virtuozzo remote backup location is defined: query that server.
  backuplisting=$(prlctl backup-list -s "root@$(</etc/vz/backup_location)" --backup-path "/backup/$(hostname -s)" | tee "${pbackupcache}")
else
  backuplisting=$(prlctl backup-list | tee "${pbackupcache}")
fi

# Nothing to evaluate without a listing: report it and exit with status 1.
if [ -z "${backuplisting}" ]; then
  # Data goes through %s so a '%' or '\' in the server name cannot be
  # interpreted as a printf directive.
  printf 'vz7-backups-check - (%s) prlctl backup-list empty\n' "${backup_server}"
  exit 1
fi
# Sql query to get vms created by a2hosting.com email addresses whose hostname
# contains 'test', 'example' or 'syseng', OR whose email address contains 'test'.
# The 'syseng' pattern now carries wildcards: without them LIKE only matched a
# hostname that was exactly "syseng", contradicting the rule described above.
testvmquery="select v.hostname
from
vps v join users u join servers s
on v.uid=u.uid and v.serid=s.serid
where
u.email like '%@a2hosting.com'
and (v.hostname like '%test%' or v.hostname like '%example%' or u.email like '%test%' or v.hostname like '%syseng%');"
# Database password: 4th single-quote-delimited field of the dbpass line.
mysql_rootpass=$(awk -F"'" '/dbpass/ {print $4}' /usr/local/virtualizor/universal.php)
# Join the hostnames with '|' to form an alternation for grep -E.
# Blank lines are dropped first: an empty alternative would match every vm and
# silently exclude all of them from the check.
# NOTE(review): the password is still passed on the command line and is
# therefore visible in the process list while mysql runs.
testvms=$(/usr/local/emps/bin/mysql virtualizor -p"${mysql_rootpass}" -BNe "${testvmquery}" 2>/dev/null \
  | awk 'NF' \
  | tr '\n' '|' \
  | sed 's/.$/\n/')
# If there is at least one test vm, get the list of test vm uuids (first column
# of `prlctl list -o uuid,hostname`), one per line.
if [ -n "${testvms}" ]; then
  testuuids=$(prlctl list -o uuid,hostname | grep -E "${testvms}" | awk '{print $1}')
# If there are no test vms, set testuuids to a string which does not match the uuid of any vm
else
  testuuids="null"
fi
# Vm ids listed in /opt/ignorevmbackup are left out of the check. When the file
# is missing or empty, use the placeholder "null", which matches no vm.
ignoredvmuuids="null"
if [ -s /opt/ignorevmbackup ]; then
  ignoredvmuuids=$(</opt/ignorevmbackup)
fi
# Get the skipped vms together with the number of log entries that mention them
# ("<count> <vm id>" per line) from the vzbackup logs.
skipped_vms_with_count=$(nice -n19 zgrep -i "suspended vms skipped" /var/log/vzbackup.log* \
  | awk -F"skipped: " '{print $2}' \
  | tr ' ' '\n' \
  | sort -n \
  | uniq -c \
  | awk 'NF==2')
if [ -n "${skipped_vms_with_count}" ]; then
  # If there is at least one skipped vm that is running now, set suspended_vms_present=1
  while read -r skip_count vm; do
    # stdin is redirected so prlctl cannot consume the list being iterated.
    if prlctl status "${vm}" 2>/dev/null </dev/null | grep -iqw "running"; then
      suspended_vms_present=1
      break
    fi
  done <<< "${skipped_vms_with_count}"
  if [ "${suspended_vms_present}" -eq 1 ]; then
    # Map each skipped vm id to the number of times it was logged as skipped.
    while read -r skip_count vm; do
      skipped_vms_count["${vm}"]=${skip_count}
    done <<< "${skipped_vms_with_count}"
    # For each skipped vm, find the most recent date on which it was logged as
    # suspended and decide whether enough days have gone past since then.
    for vm in "${!skipped_vms_count[@]}"; do
      # All matching lines are considered. The former `-m1` returned only the
      # first (oldest) match of each log file, so the value could be the first
      # day of a suspension rather than the last one.
      # Each distinct date is converted once and the pipe is closed after use.
      # '[' is stripped before the date is read — presumably some log lines
      # wrap the date in brackets; verify against the log writer.
      last_seen_suspended=$(zgrep -ih "suspended vms skipped.*${vm}" /var/log/vzbackup.log* \
        | tr -d '[' \
        | awk '!seen[$1]++ {
            cmd = "date -d " $1 " +%s"
            stamp = ""
            cmd | getline stamp
            close(cmd)
            if (stamp != "") print stamp
          }' \
        | sort -n \
        | tail -1)
      # No parsable date for this vm: nothing to compare, so do not skip it.
      if [ -z "${last_seen_suspended}" ]; then
        continue
      fi
      # Grace period in days: times skipped + 1, capped at suspended_vms_max_ignore.
      if [ "${skipped_vms_count[$vm]}" -lt "${suspended_vms_max_recovery}" ]; then
        x=$(( ${skipped_vms_count[$vm]} + 1 ))
      else
        x=${suspended_vms_max_ignore}
      fi
      x_days_ago=$(date --date "${x} days ago" "+%s")
      # If x days have not gone past since the vm was last seen suspended, add the vm to vms_to_skip_from_check
      if [ "${x_days_ago}" -lt "${last_seen_suspended}" ]; then
        vms_to_skip_from_check+=( "{${vm}}" )
      fi
    done
  fi
fi
# Evaluate every container reported by `prlctl list` (header line excluded) and
# record the ones whose total or recent snapshot count is below the limits.
for container in $(prlctl list | awk '!/UUID/ {print $1}'); do
  # Skip vms that are in the new containers list, the test vm list or the
  # ignore list. The lists are passed as printf arguments, not as the format
  # string, so '%' or '\' in them cannot be interpreted as directives.
  if printf '%s\n' "${new_containers}" "${testuuids}" "${ignoredvmuuids}" | grep -qw -- "${container}"; then
    continue
  elif [ "${suspended_vms_present}" -eq 1 ] && printf '%s\n' "${vms_to_skip_from_check[@]}" | grep -qw -- "${container}"; then
    continue
  fi
  # "<container> <epoch>" for each backup of this container; field 4 of the
  # backup listing is handed to date(1). The pipe is closed and the variable
  # reset for every row so a failed or repeated conversion cannot reuse the
  # timestamp of another row (such rows end up with one field and are not
  # counted below).
  snapshots=$(awk -v cont="${container}" '
    $1 == cont {
      cmd = "date -d " $4 " +%s"
      datestamp = ""
      cmd | getline datestamp
      close(cmd)
      print $1, datestamp
    }' <<< "${backuplisting}")
  # Total number of backups with a usable timestamp.
  total_snapshots=$(awk 'NF==2' <<< "${snapshots}" | wc -l)
  if [ "${total_snapshots}" -lt "${total_snapshots_critical_limit}" ]; then
    total_snapshots_critical["$container"]="${total_snapshots}"
  elif [ "${total_snapshots}" -lt "${total_snapshots_warning_limit}" ]; then
    total_snapshots_warning["$container"]="${total_snapshots}"
  fi
  # Number of backups newer than five days.
  recent_snapshots=$(awk -v fda="${fivedaysago}" '$2>fda' <<< "${snapshots}" | wc -l)
  if [ "${recent_snapshots}" -lt "${recent_snapshots_critical_limit}" ]; then
    recent_snapshots_critical["$container"]="${recent_snapshots}"
  elif [ "${recent_snapshots}" -lt "${recent_snapshots_warning_limit}" ]; then
    recent_snapshots_warning["$container"]="${recent_snapshots}"
  fi
done
# Print the third column of the `prlctl list` row for one container
# (used as the container's address in the report).
# Arguments: $1 - container id
lookup_container_ip() {
  prlctl list "$1" | awk '/^{/ {print $3}'
}

# Build the plugin output and exit with the matching status:
#   2 when any container is below a critical limit,
#   1 when any container is below a warning limit,
#   0 otherwise.
# Both the "total" and the "recent" section are reported when both have
# offenders; previously an elif dropped the recent section whenever the total
# section was non-empty, hiding those containers from the alert.
msg=""
if [ "${#total_snapshots_critical[@]}" -gt 0 ] || [ "${#recent_snapshots_critical[@]}" -gt 0 ]; then
  if [ "${#total_snapshots_critical[@]}" -gt 0 ]; then
    msg="Total Snapshots Critical:"
    for container in "${!total_snapshots_critical[@]}"; do
      container_ip=$(lookup_container_ip "${container}")
      msg="${msg} ${container_ip}:${total_snapshots_critical[$container]}"
    done
  fi
  if [ "${#recent_snapshots_critical[@]}" -gt 0 ]; then
    # Separate from a preceding section with a single space.
    msg="${msg:+${msg} }Recent Snapshots Critical:"
    for container in "${!recent_snapshots_critical[@]}"; do
      container_ip=$(lookup_container_ip "${container}")
      msg="${msg} ${container_ip}:${recent_snapshots_critical[$container]}"
    done
  fi
  # Data is passed through %s so it can never act as a printf directive.
  printf 'vz7-backups-check - (%s) %s\n' "${backup_server}" "${msg}"
  exit 2
elif [ "${#total_snapshots_warning[@]}" -gt 0 ] || [ "${#recent_snapshots_warning[@]}" -gt 0 ]; then
  if [ "${#total_snapshots_warning[@]}" -gt 0 ]; then
    msg="Total Snapshots Warning:"
    for container in "${!total_snapshots_warning[@]}"; do
      container_ip=$(lookup_container_ip "${container}")
      msg="${msg} ${container_ip}:${total_snapshots_warning[$container]}"
    done
  fi
  if [ "${#recent_snapshots_warning[@]}" -gt 0 ]; then
    msg="${msg:+${msg} }Recent Snapshots Warning:"
    for container in "${!recent_snapshots_warning[@]}"; do
      container_ip=$(lookup_container_ip "${container}")
      msg="${msg} ${container_ip}:${recent_snapshots_warning[$container]}"
    done
  fi
  printf 'vz7-backups-check - (%s) %s\n' "${backup_server}" "${msg}"
  exit 1
else
  printf 'vz7-backups-check - (%s) OK\n' "${backup_server}"
  exit 0
fi