tractatus/scripts/monitoring/log-monitor.sh
TheFlow c755c49ec1 ops: implement comprehensive production monitoring system
Create self-hosted, privacy-first monitoring infrastructure for production
environment with automated health checks, log analysis, and alerting.

Monitoring Components:
- health-check.sh: Application health, service status, DB connectivity, disk space
- log-monitor.sh: Error detection, security events, anomaly detection
- disk-monitor.sh: Disk space usage monitoring (5 paths)
- ssl-monitor.sh: SSL certificate expiry monitoring
- monitor-all.sh: Master orchestration script

Features:
- Email alerting system (configurable thresholds)
- Consecutive failure tracking (prevents false positives)
- Test mode for safe deployment testing
- Comprehensive logging to /var/log/tractatus/
- Cron-ready for automated execution
- Exit codes for monitoring tool integration

Alert Triggers:
- Health: 3 consecutive failures (15min downtime)
- Logs: 10 errors OR 3 critical errors in 5min
- Disk: 80% warning, 90% critical
- SSL: 30 days warning, 7 days critical

Setup Documentation:
- Complete installation instructions
- Cron configuration examples
- Systemd timer alternative
- Troubleshooting guide
- Alert customization guide
- Incident response procedures

Privacy-First Design:
- Self-hosted (no external monitoring services)
- Minimal data exposure in alerts
- Local log storage only
- No telemetry to third parties

Aligns with Tractatus values: transparency, privacy, operational excellence

Addresses Phase 4 Prep Checklist Task #6: Production Monitoring & Alerting

Next: Deploy to production, configure email alerts, set up cron jobs

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-10-09 22:23:40 +13:00

269 lines
6.7 KiB
Bash
Executable file

#!/bin/bash
#
# Log Monitoring Script
# Monitors Tractatus service logs for errors, security events, and anomalies
#
# Usage:
# ./log-monitor.sh # Monitor logs from the last 5 minutes (default window)
# ./log-monitor.sh --since "1 hour ago" # Monitor specific time window
# ./log-monitor.sh --follow # Continuous monitoring
# ./log-monitor.sh --test # Test mode (no alerts)
#
# Exit codes:
# 0 = No issues found
# 1 = Errors detected
# 2 = Critical errors detected
# 3 = Configuration error
set -euo pipefail

# Configuration
# SERVICE_NAME, ALERT_EMAIL, ERROR_THRESHOLD and CRITICAL_THRESHOLD can be
# overridden from the environment; the value after ":-" is the default.
SERVICE_NAME="${SERVICE_NAME:-tractatus}"
ALERT_EMAIL="${ALERT_EMAIL:-}"
LOG_FILE="/var/log/tractatus/log-monitor.log"
# NOTE(review): STATE_FILE is assigned here but not read by any code visible
# in this script; kept for compatibility.
STATE_FILE="/var/tmp/tractatus-log-monitor-state"
ERROR_THRESHOLD="${ERROR_THRESHOLD:-10}"      # Alert after this many errors in window (default 10)
CRITICAL_THRESHOLD="${CRITICAL_THRESHOLD:-3}" # Alert immediately after this many critical errors (default 3)

# The thresholds are compared numerically later on, so reject anything that is
# not a non-negative integer right away with the configuration-error exit code.
for threshold_name in ERROR_THRESHOLD CRITICAL_THRESHOLD; do
  if ! [[ "${!threshold_name}" =~ ^[0-9]+$ ]]; then
    echo "Invalid $threshold_name: '${!threshold_name}' (expected a non-negative integer)" >&2
    exit 3
  fi
done
unset threshold_name
# Parse arguments
# Defaults: analyse the last 5 minutes, one-shot mode, alerts enabled.
# Any usage error exits with 3 (the documented configuration-error code).
SINCE="5 minutes ago"
FOLLOW=false
TEST_MODE=false
while [[ $# -gt 0 ]]; do
  case "$1" in
    --since)
      # Check that a value follows before touching $2: with `set -u` an absent
      # $2 aborts with an "unbound variable" error (exit 1), and without it
      # `shift 2` fails and the loop never terminates.
      if [[ $# -lt 2 || -z "$2" ]]; then
        echo "Option --since requires a value (e.g. --since \"1 hour ago\")" >&2
        exit 3
      fi
      SINCE="$2"
      shift 2
      ;;
    --follow)
      FOLLOW=true
      shift
      ;;
    --test)
      TEST_MODE=true
      shift
      ;;
    *)
      echo "Unknown option: $1" >&2
      exit 3
      ;;
  esac
done
# Logging function
# Prints "[YYYY-MM-DD HH:MM:SS] [LEVEL] message" to stdout and, when the
# directory of LOG_FILE exists, appends the same line to LOG_FILE.
# Globals:   LOG_FILE (read)
# Arguments: $1 - level tag (e.g. INFO, WARN, ERROR, CRITICAL)
#            $2.. - message words, joined with spaces
# Returns:   0 even when the log file cannot be written
log() {
  local level="$1"
  shift
  local message="$*"
  local timestamp
  timestamp=$(date '+%Y-%m-%d %H:%M:%S')
  local entry="[$timestamp] [$level] $message"

  printf '%s\n' "$entry"

  # Log to file if directory exists.
  # The append is best-effort: the directory may exist while the file is not
  # writable by the current user, and a failed write must not make this
  # function return non-zero (which would abort the script under `set -e`).
  if [[ -d "$(dirname "$LOG_FILE")" ]]; then
    { printf '%s\n' "$entry" >> "$LOG_FILE"; } 2>/dev/null || true
  fi
}
# Send alert email
# Delivers an alert using `mail` if available, otherwise `sendmail`.
# Nothing is sent in test mode or when no recipient is configured.
# Globals:   TEST_MODE (read), ALERT_EMAIL (read)
# Arguments: $1 - subject line
#            $2 - message body
# Returns:   0 always; delivery is best-effort and a mailer failure is logged
#            instead of propagated, so a broken mail setup cannot abort the
#            calling monitor under `set -e`.
send_alert() {
  local subject="$1"
  local body="$2"

  if [[ "$TEST_MODE" == "true" ]]; then
    log "INFO" "TEST MODE: Would send alert: $subject"
    return 0
  fi

  if [[ -z "$ALERT_EMAIL" ]]; then
    log "WARN" "No alert email configured (ALERT_EMAIL not set)"
    return 0
  fi

  if command -v mail &> /dev/null; then
    # Only report success when the mailer actually exited 0.
    if printf '%s\n' "$body" | mail -s "$subject" "$ALERT_EMAIL"; then
      log "INFO" "Alert email sent to $ALERT_EMAIL"
    else
      log "ERROR" "Failed to send alert email to $ALERT_EMAIL via mail"
    fi
  elif command -v sendmail &> /dev/null; then
    # sendmail takes a raw message: headers, a blank line, then the body.
    if {
      echo "Subject: $subject"
      echo "From: tractatus-monitoring@agenticgovernance.digital"
      echo "To: $ALERT_EMAIL"
      echo ""
      printf '%s\n' "$body"
    } | sendmail "$ALERT_EMAIL"; then
      log "INFO" "Alert email sent via sendmail to $ALERT_EMAIL"
    else
      log "ERROR" "Failed to send alert email to $ALERT_EMAIL via sendmail"
    fi
  else
    log "WARN" "No email command available"
  fi

  return 0
}
# Extract errors from logs
# Writes the journal entries of SERVICE_NAME since the given time to stdout.
# Globals:   SERVICE_NAME (read)
# Arguments: $1 - time specification passed to `journalctl --since`
# Outputs:   journal text on stdout; the failure diagnostic on stderr
# Returns:   0 on success, 1 when journalctl fails
extract_errors() {
  local since="$1"

  # Get logs since specified time (journalctl's own stderr is discarded).
  if ! sudo journalctl -u "$SERVICE_NAME" --since "$since" --no-pager 2>/dev/null; then
    # Stdout of this function is the log data the caller captures, so the
    # diagnostic must go to stderr or it would be mixed into that data.
    log "ERROR" "Failed to read journal for $SERVICE_NAME" >&2
    return 1
  fi
}
# Count the lines of a text that match a case-insensitive extended regex.
# Arguments: $1 - ERE pattern, $2 - text to search
# Outputs:   a single integer on stdout
_count_log_matches() {
  local pattern="$1"
  local text="$2"
  local count
  # `grep -c` prints "0" AND exits 1 when nothing matches, so appending
  # `|| echo 0` would produce two lines ("0" twice). Keep grep's own output and
  # only fall back to 0 if grep printed nothing at all.
  count=$(grep -ciE -- "$pattern" <<<"$text") || count="${count:-0}"
  printf '%s\n' "$count"
}

# Analyze log patterns
# Counts matching lines per category, logs a one-line summary, then logs a
# message for every category that needs attention.
# Globals:   CRITICAL_THRESHOLD (read), ERROR_THRESHOLD (read)
# Arguments: $1 - log text to analyse
# Returns:   2 when the critical count reaches CRITICAL_THRESHOLD,
#            1 when the error count reaches ERROR_THRESHOLD,
#            0 otherwise (security/database/exception findings only warn)
analyze_logs() {
  local logs="$1"
  local error_count critical_count warn_count
  local security_count db_error_count http_error_count exception_count

  # Count different severity levels
  error_count=$(_count_log_matches "\[ERROR\]" "$logs")
  critical_count=$(_count_log_matches "\[CRITICAL\]" "$logs")
  warn_count=$(_count_log_matches "\[WARN\]" "$logs")
  # Security-related patterns
  security_count=$(_count_log_matches "(SECURITY|unauthorized|forbidden|authentication failed)" "$logs")
  # Database errors
  db_error_count=$(_count_log_matches "(mongodb|database|connection.*failed)" "$logs")
  # HTTP errors
  http_error_count=$(_count_log_matches "HTTP.*50[0-9]|Internal Server Error" "$logs")
  # Unhandled exceptions
  exception_count=$(_count_log_matches "(Unhandled.*exception|TypeError|ReferenceError)" "$logs")

  log "INFO" "Log analysis: CRITICAL=$critical_count ERROR=$error_count WARN=$warn_count SECURITY=$security_count DB_ERROR=$db_error_count HTTP_ERROR=$http_error_count EXCEPTION=$exception_count"

  # Determine severity; critical takes precedence over error.
  local status=0
  if [[ "$critical_count" -ge "$CRITICAL_THRESHOLD" ]]; then
    log "CRITICAL" "Critical error threshold exceeded: $critical_count critical errors"
    status=2
  elif [[ "$error_count" -ge "$ERROR_THRESHOLD" ]]; then
    log "ERROR" "Error threshold exceeded: $error_count errors"
    status=1
  fi

  # Warnings are emitted regardless of the severity above so that they are not
  # lost exactly when a threshold has been exceeded.
  if [[ "$security_count" -gt 0 ]]; then
    log "WARN" "Security events detected: $security_count events"
  fi
  if [[ "$db_error_count" -gt 5 ]]; then
    log "WARN" "Database errors detected: $db_error_count errors"
  fi
  if [[ "$exception_count" -gt 0 ]]; then
    log "WARN" "Unhandled exceptions detected: $exception_count exceptions"
  fi

  return "$status"
}
# Extract top error messages
# Prints the most frequent ERROR/CRITICAL messages as "<count> <message>"
# lines (the format produced by `uniq -c`), most frequent first.
# Arguments: $1 - log text
#            $2 - maximum number of distinct messages to print (default 10)
# Outputs:   the ranking on stdout; nothing when no line carries a tag
get_top_errors() {
  local logs="$1"
  local limit="${2:-10}"

  # Strip everything up to and including the severity tag rather than up to
  # the last "] " in the line: messages that themselves contain brackets
  # (e.g. "failed [id=5] retrying") would otherwise be cut down to their tail.
  # The I flag keeps the strip case-insensitive, matching grep -i.
  printf '%s\n' "$logs" \
    | grep -iE "\[ERROR\]|\[CRITICAL\]" \
    | sed -E 's/^.*\[(ERROR|CRITICAL)\][[:space:]]*//I' \
    | sort | uniq -c | sort -rn | head -n "$limit"
}
# Main monitoring function
# One-shot run: fetch the service logs for the SINCE window, analyse them and
# send an alert when analyze_logs reports a problem.
# Globals: SINCE (read), SERVICE_NAME (read)
# Exits:   with analyze_logs' status (0, 1 or 2), 0 when the window contains
#          no logs, or 3 when the logs cannot be read. Never returns.
main() {
  log "INFO" "Starting log monitoring (since: $SINCE)"

  # Extract logs
  local logs
  logs=$(extract_errors "$SINCE") || {
    log "ERROR" "Failed to extract logs"
    exit 3
  }

  # Count total log entries. An empty result must count as 0: piping an empty
  # string through `echo | wc -l` yields 1, which made the "no logs" branch
  # below unreachable.
  local log_count=0
  if [[ -n "$logs" ]]; then
    log_count=$(wc -l <<<"$logs")
    log_count="${log_count//[[:space:]]/}" # some wc implementations pad the number
  fi
  log "INFO" "Analyzing $log_count log entries"
  if [[ "$log_count" -eq 0 ]]; then
    log "INFO" "No logs found in time window"
    exit 0
  fi

  # Analyze logs; capture the status without tripping `set -e`.
  local exit_code=0
  analyze_logs "$logs" || exit_code=$?

  # If errors detected, send alert
  if [[ "$exit_code" -ne 0 ]]; then
    local severity="ERROR"
    if [[ "$exit_code" -eq 2 ]]; then
      severity="CRITICAL"
    fi
    local subject="[ALERT] Tractatus Log Monitoring - $severity Detected"

    # The report sections are informational: a failing helper pipeline must
    # not prevent the alert from going out, hence the `|| true` guards.
    local top_errors recent_errors
    # Extract top 10 error messages
    top_errors=$(get_top_errors "$logs" 10) || true
    recent_errors=$(grep -iE "\[ERROR\]|\[CRITICAL\]" <<<"$logs" | tail -n 20) || true

    local body="Log monitoring detected $severity level issues in Tractatus service.
Time Window: $SINCE
Time: $(date '+%Y-%m-%d %H:%M:%S %Z')
Host: $(hostname)
Service: $SERVICE_NAME
Top Error Messages:
$top_errors
Recent Critical/Error Logs:
$recent_errors
Full logs:
sudo journalctl -u $SERVICE_NAME --since \"$SINCE\"
Check service status:
sudo systemctl status $SERVICE_NAME
"
    send_alert "$subject" "$body"
  else
    log "INFO" "No significant issues detected"
  fi

  exit "$exit_code"
}
# Follow mode (continuous monitoring)
# Streams the journal of SERVICE_NAME and inspects every new line: lines
# tagged [ERROR] or [CRITICAL] are logged, and those that also match a
# critical pattern trigger an alert.
# Globals: SERVICE_NAME (read)
# Returns: the status of the journalctl | while pipeline
follow_logs() {
  log "INFO" "Starting continuous log monitoring"

  local line subject body recent_logs
  # IFS= keeps leading whitespace so lines are logged and mailed unmodified.
  sudo journalctl -u "$SERVICE_NAME" -f --no-pager | while IFS= read -r line; do
    # Check for error patterns
    if ! grep -qiE "\[ERROR\]|\[CRITICAL\]" <<<"$line"; then
      continue
    fi
    # A failed log write must not terminate the follower.
    log "ERROR" "$line" || true

    # Check for critical patterns
    if grep -qiE "\[CRITICAL\]|Unhandled.*exception|Database.*failed|Service.*crashed" <<<"$line"; then
      subject="[CRITICAL] Tractatus Error Detected"
      # Context is optional; an unreadable journal should not block the alert.
      recent_logs=$(sudo journalctl -u "$SERVICE_NAME" -n 10 --no-pager) || recent_logs=""
      body="Critical error detected in Tractatus logs:
$line
Time: $(date '+%Y-%m-%d %H:%M:%S %Z')
Host: $(hostname)
Recent logs:
$recent_logs
"
      # Keep following even when one alert cannot be dispatched.
      send_alert "$subject" "$body" || log "WARN" "Failed to dispatch alert for: $line" || true
    fi
  done
}
# Entry point: pick the run mode from the FOLLOW flag.
# "true" starts continuous monitoring; any other value performs a single
# analysis pass.
case "$FOLLOW" in
  true)
    follow_logs
    ;;
  *)
    main
    ;;
esac