Files
jabali-panel/bin/jabali-health-monitor
2026-01-24 19:36:46 +02:00

514 lines
15 KiB
PHP
Executable File

#!/usr/bin/env php
<?php
declare(strict_types=1);
/**
* Jabali Health Monitor
*
* Monitors critical services and automatically restarts them if they fail.
* Monitors server load and sends alerts when load is high for extended periods.
* Sends notifications to admin on service failures and recoveries.
*/
// Filesystem locations used by the monitor.
const LOG_FILE = '/var/log/jabali/health-monitor.log';
const STATE_FILE = '/var/run/jabali/health-monitor.state';

// Pause between two monitoring passes, in seconds.
const CHECK_INTERVAL = 30;

// Fallback load-alert settings used when none can be read from the panel.
const DEFAULT_LOAD_THRESHOLD = 5.0;
const DEFAULT_LOAD_ALERT_MINUTES = 5;

// NOTE(review): the two queue constants are not referenced in the visible part of this file.
const DEFAULT_QUEUE_STUCK_SECONDS = 90;
const DEFAULT_QUEUE_RESTART_COOLDOWN = 300;
// Critical services to monitor (always part of the monitored set).
// Each compact row is [description, check command, restart command] and is
// expanded into the keyed structure the rest of the script reads.
$criticalServices = array_map(
    static fn (array $row): array => [
        'description' => $row[0],
        'check_command' => $row[1],
        'restart_command' => $row[2],
    ],
    [
        'nginx' => [
            'Web Server',
            'systemctl is-active nginx',
            'systemctl reload nginx || systemctl start nginx',
        ],
        'mariadb' => [
            'Database Server',
            'systemctl is-active mariadb',
            'systemctl restart mariadb',
        ],
        'jabali-agent' => [
            'Jabali Agent',
            'systemctl is-active jabali-agent',
            'systemctl restart jabali-agent',
        ],
        'jabali-queue' => [
            'Queue Worker',
            'systemctl is-active jabali-queue',
            'systemctl restart jabali-queue',
        ],
    ],
);
// Optional services - only monitor if installed.
// Each compact row is [description, check command, restart command, fallback unit names?];
// the 'fallback' key is only present on entries that supply the fourth element.
$optionalServices = array_map(
    static function (array $row): array {
        [$description, $checkCommand, $restartCommand] = $row;

        $service = [
            'description' => $description,
            'check_command' => $checkCommand,
            'restart_command' => $restartCommand,
        ];
        if (isset($row[3])) {
            $service['fallback'] = $row[3];
        }

        return $service;
    },
    [
        'php8.4-fpm' => [
            'PHP-FPM',
            'systemctl is-active php8.4-fpm',
            'systemctl reload php8.4-fpm || systemctl start php8.4-fpm',
            ['php8.3-fpm', 'php8.2-fpm', 'php8.1-fpm'],
        ],
        'postfix' => [
            'Mail Transfer Agent',
            'systemctl is-active postfix',
            'systemctl restart postfix',
        ],
        'dovecot' => [
            'IMAP/POP3 Server',
            'systemctl is-active dovecot',
            'systemctl restart dovecot',
        ],
        'named' => [
            'DNS Server',
            'systemctl is-active named',
            'systemctl restart named',
            ['bind9'],
        ],
        'redis-server' => [
            'Redis Cache',
            'systemctl is-active redis-server',
            'systemctl restart redis-server',
            ['redis'],
        ],
        'fail2ban' => [
            'Intrusion Prevention',
            'systemctl is-active fail2ban',
            'systemctl restart fail2ban',
        ],
    ],
);
/**
 * Append one timestamped line to LOG_FILE and echo the same line to stdout.
 *
 * @param string $message Text of the log entry.
 * @param string $level   Severity label placed in the second bracket pair.
 */
function logger(string $message, string $level = 'INFO'): void
{
    $entry = sprintf("[%s] [%s] %s\n", date('Y-m-d H:i:s'), $level, $message);

    file_put_contents(LOG_FILE, $entry, FILE_APPEND);

    // Echoed as well so the line also reaches the console (journald under systemd).
    echo $entry;
}
/**
 * Load the persisted monitor state from STATE_FILE.
 *
 * Any problem (file missing, unreadable, empty, or not a JSON array/object)
 * results in an empty state rather than an error, so the monitor simply
 * starts from scratch.
 *
 * @return array Decoded state, or [] when nothing usable is stored.
 */
function loadState(): array
{
    if (!is_file(STATE_FILE) || !is_readable(STATE_FILE)) {
        return [];
    }

    $content = file_get_contents(STATE_FILE);
    // file_get_contents() returns false on failure; with strict_types enabled,
    // handing that to json_decode() would raise a TypeError and kill the daemon.
    if ($content === false || $content === '') {
        return [];
    }

    $state = json_decode($content, true);

    return is_array($state) ? $state : [];
}
/**
 * Persist the monitor state to STATE_FILE as pretty-printed JSON.
 *
 * The JSON is written to a sibling temporary file and then renamed over
 * STATE_FILE, so an interruption mid-write (for example the shutdown signal
 * handlers calling exit) cannot leave a truncated state file behind. If the
 * state cannot be encoded or written, the previous state file is left intact
 * and the failure is logged.
 *
 * @param array $state State to persist.
 */
function saveState(array $state): void
{
    $json = json_encode($state, JSON_PRETTY_PRINT);
    if ($json === false) {
        // Writing `false` would clobber the existing file with an empty one.
        logger('Could not encode monitor state: ' . json_last_error_msg(), 'ERROR');
        return;
    }

    $tmpFile = STATE_FILE . '.tmp';
    if (file_put_contents($tmpFile, $json, LOCK_EX) === false) {
        logger("Could not write monitor state to {$tmpFile}", 'ERROR');
        return;
    }

    // rename() replaces the target in one step when both paths share a filesystem.
    if (!rename($tmpFile, STATE_FILE)) {
        logger('Could not move monitor state into place at ' . STATE_FILE, 'ERROR');
        if (is_file($tmpFile)) {
            unlink($tmpFile);
        }
    }
}
/**
 * Run a check command and report whether it exited with status 0.
 *
 * @param string $checkCommand Shell command to run; its stderr is discarded.
 * @return bool True when the command's exit status is 0.
 */
function isServiceRunning(string $checkCommand): bool
{
    $lines = [];
    $exitStatus = 1;
    exec("{$checkCommand} 2>/dev/null", $lines, $exitStatus);

    return 0 === $exitStatus;
}
/**
 * Check whether a systemd unit file exists for the given service.
 *
 * The unit name is shell-escaped and matched as a fixed string
 * ("<service>.service"), so characters such as "." in "php8.4-fpm" are not
 * interpreted as regex wildcards and cannot alter the shell command.
 *
 * @param string $service Service name without the ".service" suffix.
 * @return bool True when `systemctl list-unit-files` lists the unit.
 */
function isServiceInstalled(string $service): bool
{
    if ($service === '') {
        return false;
    }

    $unit = escapeshellarg("{$service}.service");
    $output = [];
    $returnCode = 1;
    // The grep decides the result: its exit status is 0 only when the unit is listed.
    exec("systemctl list-unit-files {$unit} 2>/dev/null | grep -qF -- {$unit}", $output, $returnCode);

    return $returnCode === 0;
}
/**
 * Pick the PHP-FPM unit to monitor.
 *
 * Candidates are probed from the newest listed PHP version to the oldest and
 * the first one reported as installed wins.
 *
 * @return string|null Unit name such as "php8.3-fpm", or null when none is installed.
 */
function findPhpFpmService(): ?string
{
    foreach (['8.4', '8.3', '8.2', '8.1', '8.0'] as $release) {
        $candidate = sprintf('php%s-fpm', $release);
        if (!isServiceInstalled($candidate)) {
            continue;
        }

        return $candidate;
    }

    return null;
}
/**
 * Run a restart command, then pause briefly before reporting the outcome.
 *
 * @param string $restartCommand Shell command to run; stderr is merged into stdout.
 * @return bool True when the command exited with status 0.
 */
function restartService(string $restartCommand): bool
{
    $lines = [];
    $exitStatus = 1;
    exec("{$restartCommand} 2>&1", $lines, $exitStatus);

    // Give the service a moment to come up before the caller re-checks it.
    sleep(2);

    return 0 === $exitStatus;
}
/**
 * Parse a Laravel-style .env file into a key/value map.
 *
 * Rules applied per line:
 *  - blank lines and lines starting with "#" are ignored;
 *  - the line is split on the first "=" only, so values may contain "=";
 *  - keys and values are trimmed; lines with an empty key are ignored;
 *  - one pair of matching surrounding quotes (single or double) is removed.
 *
 * A missing or unreadable file yields an empty array instead of warnings.
 *
 * @param string $path Path to the .env file.
 * @return array<string, string> Parsed variables; later duplicates overwrite earlier ones.
 */
function parseEnvFile(string $path): array
{
    if (!is_file($path) || !is_readable($path)) {
        return [];
    }

    $lines = file($path, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
    if ($lines === false) {
        return [];
    }

    $env = [];
    foreach ($lines as $line) {
        $line = trim($line);
        // Skip blank lines and comments
        if ($line === '' || $line[0] === '#') {
            continue;
        }

        // Split on first = only
        $pos = strpos($line, '=');
        if ($pos === false) {
            continue;
        }

        $key = trim(substr($line, 0, $pos));
        if ($key === '') {
            continue;
        }

        $value = trim(substr($line, $pos + 1));

        // Strip quotes only when the value really is wrapped in a matching pair;
        // a lone quote character is not a quoted value.
        $length = strlen($value);
        if ($length >= 2
            && ($value[0] === '"' || $value[0] === "'")
            && $value[$length - 1] === $value[0]
        ) {
            $value = substr($value, 1, -1);
        }

        $env[$key] = $value;
    }

    return $env;
}
/**
 * Send a service-health notification through the panel's artisan command
 * `notify:service-health`.
 *
 * Does nothing when the artisan script is not present. The outcome of the
 * artisan call is logged either way, so a failing notification channel is
 * visible in the monitor log instead of being silently dropped.
 *
 * @param string $event       Event name passed to artisan (callers in this file use
 *                            'down', 'recovered', 'restarted' and 'failed').
 * @param string $service     Name of the affected service.
 * @param string $description Human-readable description of the service.
 */
function sendNotification(string $event, string $service, string $description): void
{
    $artisan = '/var/www/jabali/artisan';
    if (!file_exists($artisan)) {
        return;
    }

    $cmd = sprintf(
        'cd /var/www/jabali && /usr/bin/php artisan notify:service-health %s %s --description=%s 2>&1',
        escapeshellarg($event),
        escapeshellarg($service),
        escapeshellarg($description)
    );

    $output = [];
    $returnCode = 1;
    exec($cmd, $output, $returnCode);

    if ($returnCode === 0) {
        logger("Notification sent via Laravel: {$event} - {$service}", 'INFO');
        return;
    }

    // Include the last output line (stderr is merged in) to make the failure diagnosable.
    $detail = trim((string) end($output));
    logger(
        "Notification failed via Laravel: {$event} - {$service} (exit code {$returnCode})"
            . ($detail !== '' ? ": {$detail}" : ''),
        'ERROR'
    );
}
/**
 * Send a server-load notification through the panel's artisan command
 * `notify:high-load`.
 *
 * Does nothing when the artisan script is not present. The outcome of the
 * artisan call is logged either way, so a failing notification channel is
 * visible in the monitor log instead of being silently dropped.
 *
 * @param string $event   Event name passed to artisan (callers in this file use
 *                        'high' and 'recovered').
 * @param float  $load    Load average to report.
 * @param int    $minutes Number of minutes the load has been high.
 */
function sendLoadNotification(string $event, float $load, int $minutes): void
{
    $artisan = '/var/www/jabali/artisan';
    if (!file_exists($artisan)) {
        return;
    }

    $cmd = sprintf(
        'cd /var/www/jabali && /usr/bin/php artisan notify:high-load %s %s %d 2>&1',
        escapeshellarg($event),
        escapeshellarg((string) $load),
        $minutes
    );

    $output = [];
    $returnCode = 1;
    exec($cmd, $output, $returnCode);

    if ($returnCode === 0) {
        logger("Load notification sent via Laravel: {$event} - load: {$load}", 'INFO');
        return;
    }

    // Include the last output line (stderr is merged in) to make the failure diagnosable.
    $detail = trim((string) end($output));
    logger(
        "Load notification failed via Laravel: {$event} - load: {$load} (exit code {$returnCode})"
            . ($detail !== '' ? ": {$detail}" : ''),
        'ERROR'
    );
}
/**
 * Read the first value reported by sys_getloadavg() (the 1-minute load average).
 *
 * @return float The load average, or 0.0 when no value is available.
 */
function getLoadAverage(): float
{
    $averages = sys_getloadavg();

    if (isset($averages[0])) {
        return $averages[0];
    }

    return 0.0;
}
/**
 * Get load monitoring settings from the panel (via `artisan tinker`).
 *
 * Falls back to the built-in defaults when artisan is missing, the command
 * fails, or its last output line is not a JSON object. Values that are read
 * are normalised so the caller always receives all three keys with usable
 * types: a missing key, a non-numeric or non-positive threshold, or a
 * non-numeric or negative minute count is replaced by the default for that key.
 *
 * @return array{enabled: bool, threshold: float, minutes: int}
 */
function getLoadSettings(): array
{
    $defaults = [
        'enabled' => true,
        'threshold' => DEFAULT_LOAD_THRESHOLD,
        'minutes' => DEFAULT_LOAD_ALERT_MINUTES,
    ];

    $artisan = '/var/www/jabali/artisan';
    if (!file_exists($artisan)) {
        return $defaults;
    }

    $cmd = 'cd /var/www/jabali && /usr/bin/php artisan tinker --execute="
        use App\Models\DnsSetting;
        echo json_encode([
            \'enabled\' => (bool) DnsSetting::get(\'notify_high_load\', true),
            \'threshold\' => (float) DnsSetting::get(\'load_threshold\', ' . DEFAULT_LOAD_THRESHOLD . '),
            \'minutes\' => (int) DnsSetting::get(\'load_alert_minutes\', ' . DEFAULT_LOAD_ALERT_MINUTES . '),
        ]);
    " 2>/dev/null';

    $output = [];
    $returnCode = 1;
    exec($cmd, $output, $returnCode);
    if ($returnCode !== 0 || $output === []) {
        return $defaults;
    }

    // Only the last output line is expected to carry the JSON payload.
    $settings = json_decode((string) end($output), true);
    if (!is_array($settings)) {
        return $defaults;
    }

    $threshold = $settings['threshold'] ?? null;
    $minutes = $settings['minutes'] ?? null;

    return [
        'enabled' => (bool) ($settings['enabled'] ?? $defaults['enabled']),
        // A threshold of zero or less would make every load reading count as "high".
        'threshold' => is_numeric($threshold) && $threshold > 0
            ? (float) $threshold
            : $defaults['threshold'],
        'minutes' => is_numeric($minutes) && $minutes >= 0
            ? (int) $minutes
            : $defaults['minutes'],
    ];
}
/**
 * Track the server load across passes and raise or clear the high-load alert.
 *
 * The tracking data lives under $state['load']. When load monitoring is
 * disabled that entry is removed. Otherwise:
 *  - the first pass at or above the threshold records when the high load began;
 *  - later passes send one 'high' alert once the load has stayed high for the
 *    configured number of minutes;
 *  - a pass below the threshold sends 'recovered' if an alert had been sent,
 *    and resets the tracking.
 *
 * @param array $state Monitor state, updated in place.
 */
function checkServerLoad(array &$state): void
{
    $settings = getLoadSettings();
    if (!$settings['enabled']) {
        // Monitoring disabled: drop any tracking data (unset is a no-op if absent).
        unset($state['load']);
        return;
    }

    $currentLoad = getLoadAverage();
    $threshold = $settings['threshold'];
    $alertSeconds = $settings['minutes'] * 60;

    $tracker = $state['load'] ?? [
        'high_since' => null,
        'alert_sent' => false,
        'last_load' => 0.0,
    ];

    if (!($currentLoad >= $threshold)) {
        // Load is normal; announce the recovery only if an alert went out earlier.
        if ($tracker['high_since'] !== null && $tracker['alert_sent']) {
            $since = date('H:i:s', $tracker['high_since']);
            logger("Server load recovered: {$currentLoad} (was high since {$since})", 'INFO');
            sendLoadNotification('recovered', $currentLoad, 0);
        }
        $tracker['high_since'] = null;
        $tracker['alert_sent'] = false;
    } elseif ($tracker['high_since'] === null) {
        // First pass that sees the load at or above the threshold.
        $tracker['high_since'] = time();
        $tracker['alert_sent'] = false;
        logger("Server load is high: {$currentLoad} (threshold: {$threshold})", 'WARNING');
    } else {
        // Load was already high on an earlier pass.
        $elapsed = time() - $tracker['high_since'];
        $elapsedMinutes = (int) floor($elapsed / 60);
        if ($elapsed >= $alertSeconds && !$tracker['alert_sent']) {
            logger(
                "Server load has been high ({$currentLoad}) for {$elapsedMinutes} minutes - sending alert",
                'WARNING'
            );
            sendLoadNotification('high', $currentLoad, $elapsedMinutes);
            $tracker['alert_sent'] = true;
        }
    }

    $tracker['last_load'] = $currentLoad;
    $tracker['last_check'] = time();
    $state['load'] = $tracker;
}
/**
 * Assemble the set of services to monitor.
 *
 * Starts from the critical services, adds the detected PHP-FPM unit (if any),
 * then adds each optional service that is installed. When an optional service
 * is not installed under its primary name, its 'fallback' names are tried in
 * order and the first installed one is added with generic is-active/restart
 * commands.
 *
 * @param array $criticalServices Services that are always monitored, keyed by unit name.
 * @param array $optionalServices Services monitored only when installed, keyed by unit name.
 * @return array Service definitions keyed by unit name.
 */
function buildServiceList(array $criticalServices, array $optionalServices): array
{
    $monitored = $criticalServices;

    // PHP-FPM is detected by version probing rather than taken from the optional list.
    $phpFpmUnit = findPhpFpmService();
    if ($phpFpmUnit) {
        $monitored[$phpFpmUnit] = [
            'description' => 'PHP-FPM',
            'check_command' => "systemctl is-active {$phpFpmUnit}",
            'restart_command' => "systemctl reload {$phpFpmUnit} || systemctl start {$phpFpmUnit}",
        ];
    }

    foreach ($optionalServices as $unit => $definition) {
        // PHP-FPM entries were already handled above.
        if (str_starts_with($unit, 'php') && str_contains($unit, 'fpm')) {
            continue;
        }

        if (isServiceInstalled($unit)) {
            $monitored[$unit] = $definition;
            continue;
        }

        // Primary name not installed: try the alternative unit names, first match wins.
        foreach ($definition['fallback'] ?? [] as $alias) {
            if (!isServiceInstalled($alias)) {
                continue;
            }
            $monitored[$alias] = [
                'description' => $definition['description'],
                'check_command' => "systemctl is-active {$alias}",
                'restart_command' => "systemctl restart {$alias}",
            ];
            break;
        }
    }

    return $monitored;
}
/**
 * Check one service and try to bring it back if it is down.
 *
 * Per-service bookkeeping is kept in $state[$name]:
 *  - a running service that was previously down is reported as 'recovered';
 *  - a service found down for the first time is reported as 'down';
 *  - a down service is restarted at most three times (one attempt per call);
 *    a successful restart is reported as 'restarted';
 *  - once three attempts have been used up, 'failed' is reported once and the
 *    attempt counter is pushed past the limit so it is not reported again.
 *
 * @param string $name   Unit name, used as the state key and in messages.
 * @param array  $config Service definition; reads 'description', 'check_command'
 *                       and 'restart_command'.
 * @param array  $state  Monitor state, updated in place.
 */
function checkAndHealService(string $name, array $config, array &$state): void
{
    $alive = isServiceRunning($config['check_command']);
    $previouslyDown = $state[$name]['down'] ?? false;
    $attemptsSoFar = $state[$name]['restart_attempts'] ?? 0;
    $attemptLimit = 3;

    if ($alive) {
        if ($previouslyDown) {
            // Service came back since the last pass.
            logger(sprintf('%s (%s) recovered', $config['description'], $name), 'INFO');
            sendNotification('recovered', $name, $config['description']);
            $state[$name] = ['down' => false, 'restart_attempts' => 0, 'last_check' => time()];
        }
        $state[$name]['last_check'] = time();
        return;
    }

    if (!$previouslyDown) {
        // First pass that sees the service down.
        logger(sprintf('%s (%s) is DOWN', $config['description'], $name), 'WARNING');
        sendNotification('down', $name, $config['description']);
        $state[$name] = [
            'down' => true,
            'restart_attempts' => 0,
            'down_since' => time(),
            'last_check' => time(),
        ];
    }

    if ($attemptsSoFar < $attemptLimit) {
        $attemptNumber = $attemptsSoFar + 1;
        logger("Attempting to restart {$name} (attempt {$attemptNumber}/{$attemptLimit})", 'INFO');

        $restarted = restartService($config['restart_command']);
        $state[$name]['restart_attempts'] = $attemptNumber;
        $state[$name]['last_restart_attempt'] = time();

        // The restart only counts when the command succeeded AND the check passes afterwards.
        if ($restarted && isServiceRunning($config['check_command'])) {
            logger("{$name} restarted successfully", 'INFO');
            sendNotification('restarted', $name, $config['description']);
            $state[$name] = ['down' => false, 'restart_attempts' => 0, 'last_check' => time()];
        } else {
            logger("{$name} restart failed", 'ERROR');
        }
    } elseif ($attemptsSoFar == $attemptLimit) {
        // All attempts used up: escalate once.
        logger(
            "{$name} failed to restart after {$attemptLimit} attempts - manual intervention required",
            'ERROR'
        );
        sendNotification('failed', $name, $config['description']);
        // Moving the counter past the limit prevents repeated notifications.
        $state[$name]['restart_attempts'] = $attemptLimit + 1;
    }

    $state[$name]['last_check'] = time();
}
/**
 * Entry point: prepare directories, build the service list once, then run the
 * monitoring loop forever.
 *
 * Each pass checks every service, checks the server load, saves the state and
 * sleeps for CHECK_INTERVAL seconds. The function never returns on its own.
 */
function main(): void
{
    global $criticalServices, $optionalServices;

    // Make sure the log directory and the state directory exist (in that order).
    foreach ([dirname(LOG_FILE), dirname(STATE_FILE)] as $directory) {
        if (!is_dir($directory)) {
            mkdir($directory, 0755, true);
        }
    }

    logger("Jabali Health Monitor starting", 'INFO');

    // The service list is resolved once at startup.
    $services = buildServiceList($criticalServices, $optionalServices);
    logger(
        sprintf('Monitoring %d services: %s', count($services), implode(', ', array_keys($services))),
        'INFO'
    );

    // Resume from whatever the previous run persisted.
    $state = loadState();

    for (;;) {
        foreach ($services as $name => $config) {
            checkAndHealService($name, $config, $state);
        }

        checkServerLoad($state);
        saveState($state);

        sleep(CHECK_INTERVAL);
    }
}
// Handle signals for graceful shutdown.
// The handlers only add a log line before exiting, so when the pcntl functions
// are unavailable (extension not loaded or functions disabled) the monitor
// still runs and relies on the default signal behaviour, instead of dying
// with an "undefined function" fatal error before monitoring starts.
if (function_exists('pcntl_async_signals') && function_exists('pcntl_signal')) {
    pcntl_async_signals(true);
    foreach ([SIGTERM => 'SIGTERM', SIGINT => 'SIGINT'] as $signal => $label) {
        pcntl_signal($signal, static function () use ($label): void {
            logger("Received {$label}, shutting down", 'INFO');
            exit(0);
        });
    }
} else {
    // Written to stderr because the log directory may not exist yet at this point.
    fwrite(STDERR, "pcntl functions unavailable; shutdown signals will not be logged\n");
}

// Run the monitor
main();