tractatus/scripts/monitoring/monitor-all.sh
TheFlow c755c49ec1 ops: implement comprehensive production monitoring system
Create self-hosted, privacy-first monitoring infrastructure for production
environment with automated health checks, log analysis, and alerting.

Monitoring Components:
- health-check.sh: Application health, service status, DB connectivity, disk space
- log-monitor.sh: Error detection, security events, anomaly detection
- disk-monitor.sh: Disk space usage monitoring (5 paths)
- ssl-monitor.sh: SSL certificate expiry monitoring
- monitor-all.sh: Master orchestration script

Features:
- Email alerting system (configurable thresholds)
- Consecutive failure tracking (prevents false positives)
- Test mode for safe deployment testing
- Comprehensive logging to /var/log/tractatus/
- Cron-ready for automated execution
- Exit codes for monitoring tool integration

Alert Triggers:
- Health: 3 consecutive failures (15min downtime)
- Logs: 10 errors OR 3 critical errors in 5min
- Disk: 80% warning, 90% critical
- SSL: 30 days warning, 7 days critical

Setup Documentation:
- Complete installation instructions
- Cron configuration examples
- Systemd timer alternative
- Troubleshooting guide
- Alert customization guide
- Incident response procedures

Privacy-First Design:
- Self-hosted (no external monitoring services)
- Minimal data exposure in alerts
- Local log storage only
- No telemetry to third parties

Aligns with Tractatus values: transparency, privacy, operational excellence

Addresses Phase 4 Prep Checklist Task #6: Production Monitoring & Alerting

Next: Deploy to production, configure email alerts, set up cron jobs

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-10-09 22:23:40 +13:00

178 lines
4.2 KiB
Bash
Executable file

#!/bin/bash
#
# Master Monitoring Script
# Orchestrates all monitoring checks for Tractatus production environment
#
# Usage:
# ./monitor-all.sh # Run all monitors
# ./monitor-all.sh --test # Test mode (no alerts)
# ./monitor-all.sh --skip-ssl # Skip SSL check
#
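# Exit codes:
# 0 = All checks passed
# 1 = Some warnings
# 2 = Some critical issues
# 3 = Configuration error (unknown command-line option)
# 3 or higher may also mean a check script itself failed; the highest
#   exit status returned by any check is passed through unchanged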
set -euo pipefail

# --- Configuration -----------------------------------------------------------
# Absolute directory of this script; check scripts are resolved against it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
LOG_FILE="/var/log/tractatus/monitoring.log"
# ALERT_EMAIL is inherited from the environment and defaults to empty.
ALERT_EMAIL="${ALERT_EMAIL:-}"

# --- Command-line flags ------------------------------------------------------
TEST_MODE=false
SKIP_SSL=false
while (( $# > 0 )); do
  case "$1" in
    --test)
      TEST_MODE=true
      ;;
    --skip-ssl)
      SKIP_SSL=true
      ;;
    *)
      echo "Unknown option: $1"
      exit 3
      ;;
  esac
  shift
done

# Make the alert address visible to the child check scripts.
export ALERT_EMAIL

# Flag appended to every child invocation; empty outside test mode.
if [[ "$TEST_MODE" == "true" ]]; then
  TEST_FLAG="--test"
else
  TEST_FLAG=""
fi
#######################################
# Print a timestamped log line and, when possible, append it to LOG_FILE.
# Logging is best-effort: a missing log directory or an unwritable log file
# must never abort the monitoring run (the script runs under `set -e`).
# Globals:   LOG_FILE (read), LOG_WRITE_WARNED (written once on write failure)
# Arguments: $1   - level label placed in the second bracket (e.g. INFO)
#            $2.. - message words, joined with single spaces
# Outputs:   the formatted line on stdout; a single warning on stderr the
#            first time the log file cannot be written
# Returns:   0 always
#######################################
log() {
  local level="$1"
  shift
  local message="$*"

  # Declared separately so `local` does not mask the status of `date`;
  # an empty timestamp is preferred over aborting the run.
  local timestamp
  timestamp=$(date '+%Y-%m-%d %H:%M:%S') || timestamp=""

  local line="[$timestamp] [$level] $message"
  printf '%s\n' "$line"

  if [[ -d "$(dirname "$LOG_FILE")" ]]; then
    # The group redirect hides the shell's own "cannot open" message.
    if ! { printf '%s\n' "$line" >> "$LOG_FILE"; } 2>/dev/null; then
      if [[ -z "${LOG_WRITE_WARNED:-}" ]]; then
        LOG_WRITE_WARNED=1
        printf 'Warning: cannot write to %s; continuing with console output only\n' \
          "$LOG_FILE" >&2
      fi
    fi
  fi
  return 0
}
#######################################
# Run one monitoring script and log its outcome according to its exit status.
# Globals:   SCRIPT_DIR (read), TEST_FLAG (read; appended when non-empty)
# Arguments: $1   - display name used in log lines
#            $2   - script file name, resolved relative to SCRIPT_DIR
#            $3.. - extra arguments forwarded to the script unchanged
# Outputs:   log lines via log()
# Returns:   the script's exit status (0 OK, 1 warning, 2 critical,
#            anything else is logged as an error)
#######################################
run_check() {
  local name="$1"
  local script="$2"
  shift 2

  # Build the command as an array so an argument containing spaces
  # (e.g. --since "5 minutes ago") reaches the child as ONE word.
  local -a cmd=("$SCRIPT_DIR/$script" "$@")
  if [[ -n "${TEST_FLAG:-}" ]]; then
    cmd+=("$TEST_FLAG")
  fi

  log "INFO" "Running $name..."

  # Capture the status without tripping `set -e` on a non-zero exit.
  local exit_code=0
  "${cmd[@]}" || exit_code=$?

  case "$exit_code" in
    0)
      log "INFO" "$name: OK ✓"
      ;;
    1)
      log "WARN" "$name: Warning"
      ;;
    2)
      log "CRITICAL" "$name: Critical"
      ;;
    *)
      log "ERROR" "$name: Error (exit code: $exit_code)"
      ;;
  esac

  return "$exit_code"
}
#######################################
# Fold one check's exit status into the counters declared by main().
# Relies on bash dynamic scoping, so it must only be called from main().
# Uses $(( )) assignments instead of ((var++)): the latter returns status 1
# when the old value is 0, which aborts the script under `set -e`.
# Arguments: $1 - exit status returned by run_check
#######################################
_record_check_result() {
  local status="$1"

  checks_run=$((checks_run + 1))
  case "$status" in
    0) checks_passed=$((checks_passed + 1)) ;;
    1) checks_warned=$((checks_warned + 1)) ;;
    2) checks_critical=$((checks_critical + 1)) ;;
    *) checks_failed=$((checks_failed + 1)) ;;
  esac

  if (( status > max_severity )); then
    max_severity=$status
  fi
}

#######################################
# Main monitoring function: run every check, tally the results, log a
# summary and terminate the script.
# Globals:   TEST_MODE (read), SKIP_SSL (read)
# Outputs:   log lines via log()
# Returns:   does not return; exits with the highest exit status reported
#            by any check (0 all passed, 1 warning, 2 critical, >=3 failed)
#######################################
main() {
  log "INFO" "=== Starting Tractatus Monitoring Suite ==="
  log "INFO" "Timestamp: $(date '+%Y-%m-%d %H:%M:%S %Z')"
  log "INFO" "Host: $(hostname)"
  if [[ "$TEST_MODE" == "true" ]]; then
    log "INFO" "TEST MODE: Alerts suppressed"
  fi

  local max_severity=0
  local checks_run=0
  local checks_passed=0
  local checks_warned=0
  local checks_critical=0
  local checks_failed=0
  local status

  # Each check: capture the status with `|| status=$?` so a non-zero exit
  # is recorded instead of terminating the script under `set -e`.

  # Health Check
  status=0
  run_check "Health Check" "health-check.sh" || status=$?
  _record_check_result "$status"

  # Log Monitor
  status=0
  run_check "Log Monitor" "log-monitor.sh" --since "5 minutes ago" || status=$?
  _record_check_result "$status"

  # Disk Monitor
  status=0
  run_check "Disk Monitor" "disk-monitor.sh" || status=$?
  _record_check_result "$status"

  # SSL Monitor (optional)
  if [[ "$SKIP_SSL" != "true" ]]; then
    status=0
    run_check "SSL Monitor" "ssl-monitor.sh" || status=$?
    _record_check_result "$status"
  fi

  # Summary
  log "INFO" "=== Monitoring Summary ==="
  log "INFO" "Checks run: $checks_run"
  log "INFO" "Passed: $checks_passed | Warned: $checks_warned | Critical: $checks_critical | Failed: $checks_failed"

  if (( max_severity == 0 )); then
    log "INFO" "All monitoring checks passed ✓"
  elif (( max_severity == 1 )); then
    log "WARN" "Some checks returned warnings"
  elif (( max_severity == 2 )); then
    log "CRITICAL" "Some checks returned critical alerts"
  else
    log "ERROR" "Some checks failed"
  fi

  log "INFO" "=== Monitoring Complete ==="
  exit "$max_severity"
}
# Entry point: run the monitoring suite. main() takes no arguments and
# terminates the script itself via `exit` with the highest check severity.
main