#!/bin/bash
#
# tractatus/scripts/monitoring/health-check.sh
#
# Health Check Monitoring Script
# Monitors Tractatus application health endpoint and service status
#
# Usage:
#   ./health-check.sh                 # Run check, alert if issues
#   ./health-check.sh --quiet         # Suppress output unless error
#   ./health-check.sh --test          # Test mode (no alerts)
#
# Exit codes:
#   0 = Healthy
#   1 = Health endpoint failed
#   2 = Service not running
#   3 = Configuration error

set -euo pipefail

# Configuration
#
# Every setting can be overridden from the environment; the value shown is
# the default used when the variable is unset or empty.
#   HEALTH_URL    URL of the application health endpoint
#   SERVICE_NAME  systemd unit whose status is checked
#   ALERT_EMAIL   alert recipient (empty by default)
#   LOG_FILE      path of the log file
#   STATE_FILE    file holding the consecutive-failure counter
#   MAX_FAILURES  number of consecutive failures that triggers an alert
HEALTH_URL="${HEALTH_URL:-https://agenticgovernance.digital/health}"
SERVICE_NAME="${SERVICE_NAME:-tractatus}"
ALERT_EMAIL="${ALERT_EMAIL:-}"
LOG_FILE="${LOG_FILE:-/var/log/tractatus/health-check.log}"
# NOTE(review): the default is a predictable name inside /var/tmp; point
# STATE_FILE at a directory only this job can write to on hardened hosts.
STATE_FILE="${STATE_FILE:-/var/tmp/tractatus-health-state}"
MAX_FAILURES="${MAX_FAILURES:-3}"  # Alert after 3 consecutive failures

# MAX_FAILURES is used in integer comparisons, so reject anything that is not
# a positive integer up front with the documented "configuration error" code.
if ! [[ "$MAX_FAILURES" =~ ^[1-9][0-9]*$ ]]; then
  echo "Invalid MAX_FAILURES: $MAX_FAILURES (must be a positive integer)" >&2
  exit 3
fi

# Parse arguments
#   --quiet  only ERROR/CRITICAL messages are printed
#   --test   alerts are reported but not sent
QUIET=false
TEST_MODE=false

# `for` without `in` walks the positional parameters in order; the first
# unrecognised flag stops the script with exit code 3.
for arg; do
  case "$arg" in
    --quiet)
      QUIET=true
      ;;
    --test)
      TEST_MODE=true
      ;;
    *)
      echo "Unknown option: $arg"
      exit 3
      ;;
  esac
done
# Every flag has been consumed: leave no positional parameters and no helper
# variable behind.
set --
unset arg

# Logging function
#
# Formats one "[timestamp] [LEVEL] message" line, prints it to stdout and,
# when the directory of $LOG_FILE exists, appends it to $LOG_FILE.
#
# Globals:   QUIET (read), LOG_FILE (read)
# Arguments: $1   - level (INFO, WARN, ERROR, CRITICAL)
#            $2.. - message words, joined with single spaces
# Outputs:   the line on stdout, unless QUIET is "true" and the level is
#            neither ERROR nor CRITICAL
# Returns:   0, including when the log file cannot be written
log() {
  local level="$1"
  shift
  local message="$*"
  local timestamp
  # A failing `date` must not take the health check down with it.
  timestamp=$(date '+%Y-%m-%d %H:%M:%S') || timestamp="unknown-time"
  local line="[$timestamp] [$level] $message"

  if [[ "$QUIET" != "true" || "$level" == "ERROR" || "$level" == "CRITICAL" ]]; then
    printf '%s\n' "$line"
  fi

  # Log to file if directory exists. The write is best-effort: an unwritable
  # log file would otherwise make this function fail and, under `set -e`,
  # abort the whole check before anything was tested.
  if [[ -d "$(dirname "$LOG_FILE")" ]]; then
    { printf '%s\n' "$line" >> "$LOG_FILE"; } 2> /dev/null || true
  fi

  return 0
}

# Get current failure count
#
# Globals:   STATE_FILE (read)
# Outputs:   the stored consecutive-failure count on stdout; "0" when the
#            state file is missing, unreadable, empty, or its first line is
#            not a plain non-negative decimal integer
# Returns:   0
get_failure_count() {
  local count=""

  if [[ -f "$STATE_FILE" ]]; then
    # Only the first line is meaningful; a failed read leaves count empty.
    { IFS= read -r count < "$STATE_FILE"; } 2> /dev/null || true
  fi

  # The value ends up in shell arithmetic, which would evaluate arbitrary
  # expressions; anything but digits is treated as "no failures recorded".
  if [[ "$count" =~ ^[0-9]+$ ]]; then
    # 10# forces base 10 so a zero-padded value is not parsed as octal.
    echo "$((10#$count))"
  else
    echo "0"
  fi
}

# Increment failure count
#
# Reads the current count through get_failure_count and overwrites
# $STATE_FILE with that value plus one.
increment_failure_count() {
  # Declaration and assignment deliberately share one line so that the
  # function behaves the same under `set -e` as it always has.
  local previous=$(get_failure_count)
  local updated=$((previous + 1))

  printf '%s\n' "$updated" > "$STATE_FILE"
}

# Reset failure count
#
# Overwrites $STATE_FILE so that it holds a count of zero.
reset_failure_count() {
  printf '%s\n' "0" > "$STATE_FILE"
}

# Send alert email
#
# Globals:   TEST_MODE (read), ALERT_EMAIL (read)
# Arguments: $1 - subject line
#            $2 - message body
# Returns:   0 when the message was handed to a mail command or sending was
#            skipped (test mode, ALERT_EMAIL empty, no mail command found);
#            1 when the mail command reported a failure
send_alert() {
  local subject="$1"
  local body="$2"

  if [[ "$TEST_MODE" == "true" ]]; then
    log "INFO" "TEST MODE: Would send alert: $subject"
    return 0
  fi

  if [[ -z "$ALERT_EMAIL" ]]; then
    log "WARN" "No alert email configured (ALERT_EMAIL not set)"
    return 0
  fi

  # Each delivery attempt is the condition of an `if`, so a failing mail
  # command is logged here instead of aborting the script through `set -e`
  # or being reported as sent.
  if command -v mail &> /dev/null; then
    if printf '%s\n' "$body" | mail -s "$subject" "$ALERT_EMAIL"; then
      log "INFO" "Alert email sent to $ALERT_EMAIL"
    else
      log "ERROR" "Failed to send alert email to $ALERT_EMAIL via mail"
      return 1
    fi
  elif command -v sendmail &> /dev/null; then
    # Headers, a blank separator line, then the body.
    if printf '%s\n' \
      "Subject: $subject" \
      "From: tractatus-monitoring@agenticgovernance.digital" \
      "To: $ALERT_EMAIL" \
      "" \
      "$body" | sendmail "$ALERT_EMAIL"; then
      log "INFO" "Alert email sent via sendmail to $ALERT_EMAIL"
    else
      log "ERROR" "Failed to send alert email to $ALERT_EMAIL via sendmail"
      return 1
    fi
  else
    log "WARN" "No email command available (install mailutils or sendmail)"
  fi

  return 0
}

# Check health endpoint
#
# Requests $HEALTH_URL and accepts it only if the HTTP status is 200 and the
# JSON body satisfies `.status == "ok"`.
#
# Globals:   HEALTH_URL (read)
# Returns:   0 when the endpoint is healthy; 1 when the request fails, the
#            status is not 200, jq is missing, or the body does not match
check_health_endpoint() {
  log "INFO" "Checking health endpoint: $HEALTH_URL"

  local response
  local http_code
  local body

  # Make HTTP request with timeout. -s hides the progress meter while -S
  # keeps curl's error message, which 2>&1 captures for the failure log.
  # The -w format appends the status code on a line of its own.
  response=$(curl -sS -w "\n%{http_code}" --max-time 10 "$HEALTH_URL" 2>&1) || {
    log "ERROR" "Health endpoint request failed: $response"
    return 1
  }

  # Extract HTTP code (last line)
  http_code=$(printf '%s\n' "$response" | tail -n 1)

  # Extract response body (everything except last line)
  body=$(printf '%s\n' "$response" | sed '$d')

  # Check HTTP status
  if [[ "$http_code" != "200" ]]; then
    log "ERROR" "Health endpoint returned HTTP $http_code"
    return 1
  fi

  # Without jq the body cannot be validated; say so instead of blaming the
  # response.
  if ! command -v jq &> /dev/null; then
    log "ERROR" "jq is required to validate the health endpoint response"
    return 1
  fi

  # Check response contains expected JSON
  if ! printf '%s\n' "$body" | jq -e '.status == "ok"' &> /dev/null; then
    log "ERROR" "Health endpoint response invalid: $body"
    return 1
  fi

  log "INFO" "Health endpoint OK (HTTP $http_code)"
  return 0
}

# Check systemd service status
#
# Globals:   SERVICE_NAME (read)
# Returns:   0 when systemctl reports the unit as active, 2 otherwise.
#            A unit that is active but not enabled only produces a warning.
check_service_status() {
  local unit="$SERVICE_NAME"

  log "INFO" "Checking service status: $unit"

  if systemctl is-active --quiet "$unit"; then
    # Running now, but would it come back after a reboot?
    systemctl is-enabled --quiet "$unit" ||
      log "WARN" "Service $unit is not enabled (won't start on boot)"

    log "INFO" "Service $unit is active"
    return 0
  fi

  log "ERROR" "Service $unit is not active"
  return 2
}

# Check database connectivity (quick MongoDB ping)
#
# Globals:   HEALTH_DB_URI (read, optional) - mongosh connection target;
#            defaults to localhost:27017/tractatus_prod
# Returns:   0 when the ping command succeeds within 5 seconds, 1 otherwise
check_database() {
  # Overridable like the other settings so the same script can monitor a
  # deployment whose database is not on the default host/port/name.
  local db_uri="${HEALTH_DB_URI:-localhost:27017/tractatus_prod}"

  log "INFO" "Checking database connectivity"

  # Try to connect to MongoDB (timeout 5 seconds)
  if ! timeout 5 mongosh --quiet --eval "db.adminCommand('ping')" "$db_uri" &> /dev/null; then
    log "ERROR" "Database connection failed"
    return 1
  fi

  log "INFO" "Database connectivity OK"
  return 0
}

# Check disk space
#
# Reads the usage percentage of the root filesystem from df.
#
# Returns:   0 when usage is at most 90% (a warning is logged above 80%);
#            1 when usage is above 90% or cannot be determined
check_disk_space() {
  log "INFO" "Checking disk space"

  # Get root filesystem usage percentage. -P selects the POSIX output
  # format, which keeps every filesystem on a single line even when the
  # device name is long, so line 2 always carries the capacity column.
  local usage
  usage=$(df -P / | awk 'NR==2 {sub(/%/, "", $5); print $5}') || usage=""

  # An empty or non-numeric value would otherwise compare as 0 and be
  # reported as healthy.
  if ! [[ "$usage" =~ ^[0-9]+$ ]]; then
    log "ERROR" "Unable to determine disk usage for /"
    return 1
  fi

  if (( usage > 90 )); then
    log "CRITICAL" "Disk space critical: ${usage}% used"
    return 1
  elif (( usage > 80 )); then
    log "WARN" "Disk space high: ${usage}% used"
  else
    log "INFO" "Disk space OK: ${usage}% used"
  fi

  return 0
}

# Main health check
#
# Runs every check, updates the consecutive-failure counter and sends an
# alert once the counter reaches MAX_FAILURES. Never returns: it always
# terminates the script.
#
# Globals:   MAX_FAILURES, SERVICE_NAME, HEALTH_URL (read)
# Exit code: 0 when all checks pass; 2 when the service check failed;
#            1 when only other checks failed
main() {
  log "INFO" "Starting health check"

  local all_healthy=true
  local issues=()
  # Status used when something is unhealthy; raised to 2 if the service
  # itself is down, as documented in the file header.
  local exit_code=1

  # Run all checks
  if ! check_service_status; then
    all_healthy=false
    exit_code=2
    issues+=("Service not running")
  fi

  if ! check_health_endpoint; then
    all_healthy=false
    issues+=("Health endpoint failed")
  fi

  if ! check_database; then
    all_healthy=false
    issues+=("Database connectivity failed")
  fi

  if ! check_disk_space; then
    all_healthy=false
    issues+=("Disk space issue")
  fi

  # Handle results
  if [[ "$all_healthy" == "true" ]]; then
    log "INFO" "All health checks passed ✓"
    reset_failure_count
    exit 0
  fi

  log "ERROR" "Health check failed: ${issues[*]}"
  increment_failure_count

  local failure_count
  # An unreadable counter must not abort the run before the result is
  # reported; fall back to 0.
  failure_count=$(get_failure_count) || failure_count=0
  log "WARN" "Consecutive failures: $failure_count/$MAX_FAILURES"

  # Alert if threshold reached
  if [[ "$failure_count" -ge "$MAX_FAILURES" ]]; then
    local subject="[ALERT] Tractatus Health Check Failed ($failure_count failures)"
    local issue_list
    issue_list=$(printf -- "- %s\n" "${issues[@]}")
    local body="Tractatus health check has failed $failure_count times consecutively.

Issues detected:
$issue_list

Time: $(date '+%Y-%m-%d %H:%M:%S %Z')
Host: $(hostname)
Service: $SERVICE_NAME
Health URL: $HEALTH_URL

Please investigate immediately.

View logs:
sudo journalctl -u $SERVICE_NAME -n 100

Check service status:
sudo systemctl status $SERVICE_NAME

Restart service:
sudo systemctl restart $SERVICE_NAME
"

    # Testing send_alert in a condition keeps a delivery failure from
    # aborting the script under `set -e` before the result is logged.
    if send_alert "$subject" "$body"; then
      log "CRITICAL" "Alert sent after $failure_count consecutive failures"
    else
      log "ERROR" "Alert could not be delivered after $failure_count consecutive failures"
    fi
  fi

  exit "$exit_code"
}

# Run main function
# (the command-line flags were already consumed by the top-level argument
# parser, so main is called without arguments)
main