File: //usr/lib64/nagios/plugins/check_qps_dnsdist.sh
#!/bin/bash
#
# Icinga check on DNS servers running DNSdist and
# dnsdist-query.py via cron, to detect QPS spikes.
#
# Usage: check_qps_dnsdist.sh [AGE_MINUTES]
#   AGE_MINUTES  how many minutes back a logged spike still counts as
#                current (non-negative integer, default: 15)
#
# Exit codes:
#   0 OK       - no spike logged, or the last spike is older than the window
#   2 CRITICAL - the last logged spike falls inside the window
#   3 UNKNOWN  - invalid argument, log file missing/unreadable, or the
#                timestamp of the last spike line could not be parsed
#
set -u

# Config defaults
readonly LOG_FILE="/var/log/dnsdist/query_monitor.log"
readonly DEFAULT_AGE_MINUTES=15
readonly SPIKE_MARKER="QPS spike detected!"
readonly DOMAIN_MARKER="High query domain"

# CLI override. The value is validated before it reaches arithmetic
# expansion: bash would otherwise evaluate arbitrary text as an expression.
AGE_MINUTES=${1:-${DEFAULT_AGE_MINUTES}}
if [[ ! "${AGE_MINUTES}" =~ ^[0-9]+$ ]]; then
  echo "UNKNOWN - Invalid age in minutes: '${AGE_MINUTES}' (expected a non-negative integer)"
  exit 3
fi
# Force base 10 so values with a leading zero (e.g. "08") are not
# rejected as invalid octal numbers.
AGE_MINUTES=$((10#${AGE_MINUTES}))
MAX_AGE_SECONDS=$((AGE_MINUTES * 60))

# Check log file exists and can be read; an unreadable file would
# otherwise look exactly like "no spike logged".
if [[ ! -f "${LOG_FILE}" ]]; then
  echo "UNKNOWN - Log file not found: ${LOG_FILE}"
  exit 3
fi
if [[ ! -r "${LOG_FILE}" ]]; then
  echo "UNKNOWN - Log file not readable: ${LOG_FILE}"
  exit 3
fi

# Get last spike line, in grep -n format: "<line number>:<log line>"
last_spike_line=$(grep -nF -- "${SPIKE_MARKER}" "${LOG_FILE}" | tail -n 1)
if [[ -z "${last_spike_line}" ]]; then
  echo "OK - No QPS spike logged"
  exit 0
fi

spike_line_no=${last_spike_line%%:*}
spike_log_line=${last_spike_line#*:}

# The first two space-separated fields of the log line are taken as the
# timestamp and handed to date(1) for conversion to epoch seconds.
spike_ts=$(cut -d' ' -f1,2 <<<"${spike_log_line}")
if ! spike_epoch=$(date -d "${spike_ts}" +%s 2>/dev/null) \
  || [[ -z "${spike_epoch}" ]]; then
  # Reporting OK here would hide a broken check, so flag it instead.
  echo "UNKNOWN - Cannot parse timestamp of last QPS spike: '${spike_ts}'"
  exit 3
fi
now_epoch=$(date +%s)

# Check if spike is within time window
if ((now_epoch - spike_epoch > MAX_AGE_SECONDS)); then
  echo "OK - No spike detected in the last ${AGE_MINUTES} minute(s)"
  exit 0
fi

# Pull associated domain details from the spike line and the 9 lines
# following it; everything from the third ':'-separated field onwards of
# each matching line is reported.
domains=$(tail -n +"${spike_line_no}" "${LOG_FILE}" \
  | head -n 10 \
  | grep -F -- "${DOMAIN_MARKER}" \
  | cut -d':' -f3-)

if [[ -n "${domains}" ]]; then
  # printf '%s' keeps backslashes in the logged names literal
  # (echo -e would interpret them as escape sequences).
  printf 'CRITICAL - QPS spike with dominant domain(s) in past %s minutes:\n%s\n' \
    "${AGE_MINUTES}" "${domains}"
else
  echo "CRITICAL - QPS spike detected in past ${AGE_MINUTES} minutes (distributed: 'Rest' traffic)"
fi
exit 2