File: //lib64/nagios/plugins/check_backuply.py
#!/usr/bin/env python3
"""
Backuply Health Check Plugin for Icinga
Description:
- Checks if backup servers are configured
- Verifies backup jobs exist
- Monitors recent task execution (--chours / --whours)
- Detects failed backup tasks
Usage:
./check_backuply.py [--chours 24] [--whours 12]
Exit Codes:
0 OK
1 WARNING
2 CRITICAL
3 UNKNOWN
"""
import os
import sys
import shutil
import argparse
import time
import sqlite3
import json
import subprocess
def check_backup_servers(config_path="/var/backuply/conf/backup_servers.json"):
    """Check that at least one backup server is configured.

    Args:
        config_path: Path of the JSON file that lists the backup servers.
            Defaults to the location this plugin has always read.

    Returns:
        0 (OK) when the file parses to a non-empty JSON array or object,
        otherwise 2 (CRITICAL). A one-line status message is printed to
        stdout in every case.
    """
    if not os.path.exists(config_path):
        print(f"CRITICAL: Backup servers config not found at {config_path}")
        return 2

    try:
        # JSON is UTF-8 by specification; do not depend on the locale the
        # monitoring agent happens to run under.
        with open(config_path, 'r', encoding='utf-8') as f:
            backup_servers = json.load(f)
    except (ValueError, OSError) as e:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError,
        # so undecodable bytes are reported instead of crashing the plugin.
        print(f"CRITICAL: Failed to read backup servers config: {e}")
        return 2

    if not backup_servers:
        print("CRITICAL: No backup servers configured")
        return 2

    if not isinstance(backup_servers, (list, dict)):
        # A truthy scalar (number, string, true) has no meaningful length;
        # report it rather than letting len() raise.
        print("CRITICAL: Backup servers config has an unexpected format")
        return 2

    print(f"OK: {len(backup_servers)} backup server(s) configured")
    return 0
def check_backup_jobs(config_path="/var/backuply/conf/backup.json"):
    """Check that backup jobs exist and that at least one is active.

    A job counts as active when its ``schedule_status`` field equals 1.

    Args:
        config_path: Path of the JSON file that lists the backup jobs.
            Defaults to the location this plugin has always read.

    Returns:
        0 (OK) when at least one active job is found, 1 (WARNING) when
        jobs exist but none is active, 2 (CRITICAL) when the file is
        missing, unreadable, empty, or not a JSON array/object. A one-line
        status message is printed to stdout in every case.
    """
    if not os.path.exists(config_path):
        print(f"CRITICAL: Backup jobs config not found at {config_path}")
        return 2

    try:
        # JSON is UTF-8 by specification; do not depend on the locale.
        with open(config_path, 'r', encoding='utf-8') as f:
            backup_jobs = json.load(f)
    except (ValueError, OSError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print(f"CRITICAL: Failed to read backup jobs config: {e}")
        return 2

    if not backup_jobs:
        print("CRITICAL: No backup jobs configured")
        return 2

    if isinstance(backup_jobs, dict):
        # Iterating a JSON object yields its string keys, which have no
        # .get(); inspect the values instead.
        # NOTE(review): assumes an object-shaped file maps ids to job
        # entries - confirm against the real Backuply file format.
        job_entries = list(backup_jobs.values())
    elif isinstance(backup_jobs, list):
        job_entries = backup_jobs
    else:
        print("CRITICAL: Backup jobs config has an unexpected format")
        return 2

    # Ignore entries that are not JSON objects rather than crashing on them.
    active_jobs = [
        job for job in job_entries
        if isinstance(job, dict) and job.get('schedule_status') == 1
    ]
    if not active_jobs:
        print("WARNING: No active backup jobs found")
        return 1

    print(f"OK: {len(active_jobs)} active backup job(s) configured")
    return 0
def _get_current_users_count():
    """Return the user count reported by ``whmapi1``, or None if unavailable.

    The lookup is best-effort: it is only attempted when
    /usr/local/cpanel/cpanel exists, and any failure to run the command or
    to parse its output yields None instead of an exception.
    """
    if not os.path.exists("/usr/local/cpanel/cpanel"):
        return None
    try:
        # Argument list instead of a shell string: no shell is needed for a
        # fixed command line.
        process = subprocess.Popen(
            ["whmapi1", "--output=jsonpretty", "get_current_users_count"],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        output, _ = process.communicate()
        return json.loads(output)['data']['users']
    except (OSError, ValueError, KeyError, TypeError,
            subprocess.SubprocessError):
        # Command missing, output not JSON, or JSON not shaped as expected.
        return None


def _fetch_recent_backup_tasks(db_path, created_after):
    """Return backup-creation task rows created after *created_after*.

    Rows are ordered newest first. The connection is always closed, even
    when the query raises sqlite3.Error (which is propagated to the caller).
    """
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        cursor.execute("""
            SELECT actid, uuid, action, status_txt, status, progress, created, started, updated, ended
            FROM tasks
            WHERE created > ? AND action LIKE '%creating_backup_%'
            ORDER BY created DESC
        """, (created_after,))
        return cursor.fetchall()
    finally:
        conn.close()


def check_recent_tasks(db_path="/var/backuply/db/tasks.db"):
    """Check recent backup tasks in the SQLite task database for failures.

    The thresholds are read from the command line (``--chours``, default
    24, and ``--whours``, default 12); unrecognised arguments are ignored.
    Tasks created within the last ``--chours`` hours are inspected, and any
    task whose status is not 1 is counted as failed/running.

    Args:
        db_path: Path of the SQLite task database. Defaults to the
            location this plugin has always read.

    Returns:
        2 (CRITICAL) when the database cannot be queried, when no task ran
        in the window although the host reports users, or when a
        failed/running task was created within the last ``--whours`` hours;
        1 (WARNING) when the database file is missing or when the only
        failed/running tasks are older than ``--whours`` hours;
        0 (OK) otherwise. A one-line status message is printed to stdout.
    """
    parser = argparse.ArgumentParser(add_help=False)
    parser.add_argument("--chours", type=int, default=24, help="Critical hours threshold (default: 24)")
    parser.add_argument("--whours", type=int, default=12, help="Warning hours threshold (default: 12)")
    args, _ = parser.parse_known_args()
    critical_hours = args.chours
    warning_hours = args.whours

    users_count = _get_current_users_count()

    if not os.path.exists(db_path):
        print(f"WARNING: Database not found at {db_path}")
        return 1

    now = int(time.time())
    critical_threshold = now - (critical_hours * 3600)
    warning_threshold = now - (warning_hours * 3600)

    try:
        recent_tasks = _fetch_recent_backup_tasks(db_path, critical_threshold)
    except sqlite3.Error as e:
        print(f"CRITICAL: Failed to query database: {e}")
        return 2

    # An empty window is only alarming when the host is known to have users.
    if not recent_tasks and users_count is not None and users_count > 0:
        print(f"CRITICAL: No tasks executed in last {critical_hours}h but {users_count} user(s) exist")
        return 2

    failed_tasks = [
        {
            'created': created,
            'status_txt': status_txt or f'Status {status}',
            'task': action or 'Unknown',
        }
        for _actid, _uuid, action, status_txt, status, _progress, created, *_ in recent_tasks
        if status != 1
    ]

    # Severity follows recency: a failure newer than the warning threshold is
    # critical, an older one (still inside the query window) is a warning.
    critical_failures = [t for t in failed_tasks if t['created'] > warning_threshold]
    warning_failures = [t for t in failed_tasks if warning_threshold >= t['created'] > critical_threshold]

    if critical_failures:
        failed_tasks_info = ", ".join(f"{t['task']}({t['status_txt']})" for t in critical_failures[:3])
        # These tasks fall inside the warning window, so report that window.
        print(f"CRITICAL: {len(critical_failures)} failed/running task(s) in last {warning_hours}h: {failed_tasks_info}")
        return 2

    if warning_failures:
        failed_tasks_info = ", ".join(f"{t['task']}({t['status_txt']})" for t in warning_failures[:3])
        print(f"WARNING: {len(warning_failures)} failed/running task(s) between {warning_hours}h and {critical_hours}h ago: {failed_tasks_info}")
        return 1

    print(f"OK: {len(recent_tasks)} task(s) in last {critical_hours}h, no failures detected{f', {users_count} users' if users_count is not None else ', no users'}")
    return 0
def main():
    """Run every health check in order and exit with a Nagios status code.

    Exits 0 immediately when no ``backuply`` executable is found on PATH.
    Otherwise the checks run in sequence and the process exits with the
    first non-zero result; it exits 0 when every check returns 0. A check
    that raises an unexpected exception is reported as UNKNOWN (exit 3)
    rather than ending in a traceback with Python's default exit status 1,
    which the monitoring system would interpret as WARNING.
    """
    if not shutil.which("backuply"):
        print("OK: Backuply is not installed on this server")
        sys.exit(0)

    checks = [check_backup_servers, check_backup_jobs, check_recent_tasks]
    for check_fn in checks:
        try:
            result = check_fn()
        except Exception as e:
            # Top-level boundary: turn any unforeseen failure into UNKNOWN.
            print(f"UNKNOWN: {check_fn.__name__} raised {type(e).__name__}: {e}")
            sys.exit(3)
        if result != 0:
            sys.exit(result)

    sys.exit(0)


if __name__ == "__main__":
    main()