File: //usr/lib64/nagios/plugins/check_a2_cagefs_503_errors
#!/bin/bash
# This script checks for CageFS issues where sites return 503 until CageFS is remounted
# Refactored to check for mount errors in the system log and su errors on cPanel user accounts
# Refactored to add logging and cagefs restart
# Destination file that log_message appends to.
log_file="/var/log/check_503.log"

# Append one timestamped line to $log_file.
# Globals:   log_file (read)
# Arguments: $1 - text to record
# Outputs:   a line of the form "<Mon> <dd> <HH:MM:SS> <text>" appended to
#            the log file; nothing is written to stdout.
log_message() {
  local entry="$1"
  local stamp
  stamp=$(date '+%b %d %T')
  printf '%s %s\n' "$stamp" "$entry" >> "$log_file"
}
# Everything inside this `if` runs only when a `cagefsctl` command can be
# located; the lookup's own output is discarded.
# NOTE(review): `command -v cagefsctl` is the builtin equivalent and would not
# depend on the external `which` binary — consider switching.
if which cagefsctl &> /dev/null; then
# Status lines printed on stdout. The leading digit of each message matches
# the exit code the script uses together with it (0 with msg_ok, 2 with
# msg_crit).
msg_ok="0 check_503 - OK"
msg_crit="2 check_503 - Possible 503 errors situation, resetting CageFS."
# Counters updated further down:
#   domain_count   - incremented once per domain probed
#   error_count    - incremented for every domain that answers with HTTP 503
#   su_error_count - incremented for every user whose `su` attempt fails
domain_count=0
error_count=0
su_error_count=0
# Function to check for a mount-limit error logged in the last 15 minutes.
# Arguments: $1 - (optional) syslog file to scan; defaults to
#                 /var/log/messages
# Outputs:   one log_message entry when a recent error is found
# Returns:   0 if the newest "reached the limit on mounts" entry is between
#            0 and 900 seconds old; 1 otherwise (file unreadable, no such
#            entry, timestamp cannot be parsed, or entry too old).
check_mount_error() {
  local messages_file="${1:-/var/log/messages}"
  local window_seconds=900
  local last_mount_error mount_error_timestamp
  local epoch_timestamp current_epoch previous_year time_diff

  [ -r "$messages_file" ] || return 1

  last_mount_error=$(grep -F -- "reached the limit on mounts" "$messages_file" | tail -n 1)
  [ -n "$last_mount_error" ] || return 1

  # The first three fields are expected to be "Mon DD HH:MM:SS".
  mount_error_timestamp=$(awk '{print $1, $2, $3}' <<< "$last_mount_error")

  # Bail out when the timestamp cannot be parsed, instead of feeding an empty
  # operand into the arithmetic below.
  epoch_timestamp=$(date -d "$mount_error_timestamp" +%s 2> /dev/null) || return 1
  current_epoch=$(date +%s)

  # The timestamp has no year, so date assumes the current one. An entry
  # written late last year then parses as a future date; re-read it as
  # belonging to the previous year so it is not mistaken for a recent error.
  if [ "$epoch_timestamp" -gt "$current_epoch" ]; then
    previous_year=$(( $(date +%Y) - 1 ))
    epoch_timestamp=$(date -d "$mount_error_timestamp $previous_year" +%s 2> /dev/null) || return 1
  fi

  time_diff=$((current_epoch - epoch_timestamp))
  if [ "$time_diff" -ge 0 ] && [ "$time_diff" -le "$window_seconds" ]; then
    log_message "Mount errors found in system log within last 15 minutes"
    return 0
  fi
  return 1
}
# Function to check su on each cPanel user account.
# Globals:   su_error_count (reset to 0, then set to the number of users
#            whose su attempt failed)
# Outputs:   one log_message entry when every user fails
# Returns:   0 if the su attempt failed for every listed user;
#            1 if at least one user succeeded or no users could be listed.
check_su_error() {
  local users_list cpuser
  local total_num_of_users=0

  # -x drops only the exact "root" entry; a plain substring match would also
  # discard accounts whose name merely contains "root".
  users_list=$(whmapi1 list_users --output=jsonpretty \
    | jq -r '.data.users[]' \
    | grep -vx root) || return 1
  [ -n "$users_list" ] || return 1

  # Start from zero on every call so the comparison below never depends on
  # state left behind by an earlier run.
  su_error_count=0

  # The list is fed on fd 3 so the su child keeps the script's own stdin.
  while IFS= read -r cpuser <&3; do
    [ -n "$cpuser" ] || continue
    total_num_of_users=$((total_num_of_users + 1))
    if ! su - "$cpuser" -s /bin/sh -c "exit" &> /dev/null; then
      su_error_count=$((su_error_count + 1))
    fi
  done 3<<< "$users_list"

  if [ "$su_error_count" -eq "$total_num_of_users" ]; then
    log_message "su attempt failed for $su_error_count/$total_num_of_users users."
    return 0
  fi
  return 1
}
# Main flow. Domains are probed only when the system log shows a recent
# mount-limit error or su fails for every user; otherwise OK is reported
# straight away.
if check_mount_error || check_su_error; then
  # Fetch the domain list once and emit "domain ipv4" pairs, instead of
  # calling the API again for every single domain.
  domain_rows=$(whmapi1 --output=jsonpretty get_domain_info \
    | jq -r '.data.domains[] | "\(.domain) \(.ipv4)"')

  # Loop through each domain and check whether it is returning a 503 error.
  # The rows are fed on fd 3 so curl keeps the script's own stdin.
  while read -r cur_domain ip <&3; do
    [ -n "$cur_domain" ] || continue
    domain_count=$((domain_count + 1))

    # --connect-to pins the request to the IP reported for the domain.
    status_code=$(curl --connect-timeout 2 -sL -o /dev/null -w "%{http_code}" \
      "$cur_domain" --connect-to "${cur_domain}:80:${ip}")
    # NOTE(review): -L already follows redirects, so a final 301 should be
    # rare; the retry below pins port 443 instead of 80 — confirm intent.
    if [ "$status_code" = "301" ]; then
      status_code=$(curl --connect-timeout 2 -sL -o /dev/null -w "%{http_code}" \
        "$cur_domain" --connect-to "${cur_domain}:443:${ip}")
    fi

    if [ "$status_code" = "503" ]; then
      error_count=$((error_count + 1))
      # Log at detection time so the entry names the domain that failed,
      # not whichever domain happened to be iterated last.
      log_message "$cur_domain showing 503 error."
    fi
  done 3<<< "$domain_rows"

  if [ "$error_count" -gt 0 ]; then
    log_message "$msg_crit"
    echo "$msg_crit"
    # Reset CageFS before exiting: statements placed after `exit` never run.
    # Command output is discarded so stdout carries only the status line;
    # failures are recorded in the log instead.
    systemctl restart cagefs &> /dev/null \
      || log_message "systemctl restart cagefs failed"
    /usr/sbin/cagefsctl --disable-all &> /dev/null \
      || log_message "cagefsctl --disable-all failed"
    /usr/sbin/cagefsctl --enable-all &> /dev/null \
      || log_message "cagefsctl --enable-all failed"
    exit 2
  else
    echo "$msg_ok"
    exit 0
  fi
else
  echo "$msg_ok"
  exit 0
fi
fi