tractatus/scripts/monitoring/health-check.sh
TheFlow ac2db33732 fix(submissions): restructure Economist package and fix article display
- Create Economist SubmissionTracking package correctly:
  * mainArticle = full blog post content
  * coverLetter = 216-word SIR— letter
  * Links to blog post via blogPostId
- Archive 'Letter to The Economist' from blog posts (it's the cover letter)
- Fix date display on article cards (use published_at)
- Target publication already displaying via blue badge

Database changes:
- Make blogPostId optional in SubmissionTracking model
- Economist package ID: 68fa85ae49d4900e7f2ecd83
- Le Monde package ID: 68fa2abd2e6acd5691932150

Next: Enhanced modal with tabs, validation, export

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-10-24 08:47:42 +13:00

269 lines
6.4 KiB
Bash
Executable file

#!/bin/bash
#
# Health Check Monitoring Script
# Monitors Tractatus application health endpoint and service status
#
# Usage:
# ./health-check.sh # Run check, alert if issues
# ./health-check.sh --quiet # Suppress output unless error
# ./health-check.sh --test # Test mode (no alerts)
#
# Exit codes:
# 0 = Healthy
# 1 = One or more health checks failed (e.g. health endpoint)
# 2 = Service not running
# 3 = Configuration error
# Strict mode: abort on unhandled errors, unset variables and failed pipeline stages
set -o errexit -o nounset -o pipefail

# Configuration
# The first three settings may be overridden from the environment.
: "${HEALTH_URL:=https://agenticgovernance.digital/health}"
: "${SERVICE_NAME:=tractatus}"
: "${ALERT_EMAIL:=}"
# Fixed locations for the log file and the consecutive-failure counter.
# NOTE(review): the state file has a predictable name in a world-writable
# directory; consider moving it to a directory owned by the service user.
LOG_FILE='/var/log/tractatus/health-check.log'
STATE_FILE='/var/tmp/tractatus-health-state'
# Number of consecutive failed runs before an alert is sent
MAX_FAILURES=3
# Parse arguments
# Recognised flags: --quiet and --test. Anything else is reported and the
# script exits with status 3. Each argument is consumed as it is handled.
QUIET=false
TEST_MODE=false
while (( $# > 0 )); do
  if [[ "$1" == "--quiet" ]]; then
    QUIET=true
  elif [[ "$1" == "--test" ]]; then
    TEST_MODE=true
  else
    echo "Unknown option: $1"
    exit 3
  fi
  shift
done
# Logging function
#
# Usage: log LEVEL MESSAGE...
# Globals: QUIET (read), LOG_FILE (read)
# Prints "[timestamp] [LEVEL] message" to stdout; in quiet mode only ERROR
# and CRITICAL lines are printed. The same line is appended to LOG_FILE when
# its directory exists.
# Always returns 0: file logging is best-effort, so an unwritable log file
# must not abort the health check under `set -e`.
log() {
  local level="$1"
  shift
  local message="$*"
  local timestamp
  timestamp=$(date '+%Y-%m-%d %H:%M:%S') || timestamp="unknown-time"
  local line="[$timestamp] [$level] $message"

  if [[ "$QUIET" != "true" || "$level" == "ERROR" || "$level" == "CRITICAL" ]]; then
    printf '%s\n' "$line"
  fi

  # Log to file if directory exists; swallow write errors (permissions, full
  # disk, LOG_FILE being a directory) instead of killing the caller.
  if [[ -d "$(dirname -- "$LOG_FILE")" ]]; then
    { printf '%s\n' "$line" >> "$LOG_FILE"; } 2> /dev/null || true
  fi
  return 0
}
# Get current failure count
#
# Globals: STATE_FILE (read)
# Prints the consecutive-failure count stored in STATE_FILE, or 0 when the
# file is missing, unreadable, empty, or does not hold a plain decimal number.
# Callers feed this value into shell arithmetic, so anything other than
# digits is rejected here rather than passed on for evaluation.
get_failure_count() {
  local count=""

  if [[ -f "$STATE_FILE" ]]; then
    # Only the first line matters; read fails harmlessly on an empty or
    # unreadable file and leaves count empty.
    { IFS= read -r count < "$STATE_FILE"; } 2> /dev/null || true
  fi

  if [[ "$count" =~ ^[0-9]+$ ]]; then
    # Force base 10 so values such as "08" are not parsed as octal
    printf '%s\n' "$((10#$count))"
  else
    printf '0\n'
  fi
}
# Increment failure count
# Reads the current value through get_failure_count and writes value + 1
# back to STATE_FILE.
increment_failure_count() {
  local previous
  previous=$(get_failure_count) || true
  printf '%d\n' "$((previous + 1))" > "$STATE_FILE"
}
# Reset failure count
# Overwrites STATE_FILE with a single "0" line.
reset_failure_count() {
  printf '%s\n' 0 > "$STATE_FILE"
}
# Send alert email
#
# Arguments: $1 - subject line, $2 - message body
# Globals:   TEST_MODE (read), ALERT_EMAIL (read)
# Returns:   0 when the alert was handed to a mailer, or was deliberately
#            skipped (test mode, no recipient configured, no mailer found);
#            1 when a mailer was found but reported a failure.
send_alert() {
  local subject="$1"
  local body="$2"

  if [[ "$TEST_MODE" == "true" ]]; then
    log "INFO" "TEST MODE: Would send alert: $subject"
    return 0
  fi

  if [[ -z "$ALERT_EMAIL" ]]; then
    log "WARN" "No alert email configured (ALERT_EMAIL not set)"
    return 0
  fi

  # Prefer mail, fall back to sendmail. The mailer's exit status is checked
  # so a failed delivery is logged as such instead of aborting the script
  # under `set -e` or being reported as sent.
  if command -v mail &> /dev/null; then
    if printf '%s\n' "$body" | mail -s "$subject" "$ALERT_EMAIL"; then
      log "INFO" "Alert email sent to $ALERT_EMAIL"
    else
      log "ERROR" "Failed to send alert email to $ALERT_EMAIL via mail"
      return 1
    fi
  elif command -v sendmail &> /dev/null; then
    if {
      printf 'Subject: %s\n' "$subject"
      printf 'From: tractatus-monitoring@agenticgovernance.digital\n'
      printf 'To: %s\n' "$ALERT_EMAIL"
      printf '\n'
      printf '%s\n' "$body"
    } | sendmail "$ALERT_EMAIL"; then
      log "INFO" "Alert email sent via sendmail to $ALERT_EMAIL"
    else
      log "ERROR" "Failed to send alert email to $ALERT_EMAIL via sendmail"
      return 1
    fi
  else
    log "WARN" "No email command available (install mailutils or sendmail)"
  fi
  return 0
}
# Check health endpoint
#
# Globals: HEALTH_URL (read)
# Requests HEALTH_URL (10 second limit) and requires an HTTP 200 whose JSON
# body has .status == "ok".
# Returns: 0 when healthy, 1 on request failure, non-200 status, missing jq,
#          or an unexpected body.
check_health_endpoint() {
  log "INFO" "Checking health endpoint: $HEALTH_URL"

  local response http_code body

  # -s hides the progress meter; -S keeps curl's error text so the failure
  # log below actually says why the request failed. The status code is
  # appended on its own line via -w.
  response=$(curl -sS -w "\n%{http_code}" --max-time 10 "$HEALTH_URL" 2>&1) || {
    # Flatten to one line so the log entry stays on a single row
    log "ERROR" "Health endpoint request failed: ${response//$'\n'/ }"
    return 1
  }

  # Last line is the HTTP code, everything before it is the body
  http_code=${response##*$'\n'}
  if [[ "$response" == *$'\n'* ]]; then
    body=${response%$'\n'*}
  else
    body=""
  fi

  if [[ "$http_code" != "200" ]]; then
    log "ERROR" "Health endpoint returned HTTP $http_code"
    return 1
  fi

  # Without jq the body cannot be validated; say so instead of blaming the
  # response.
  if ! command -v jq &> /dev/null; then
    log "ERROR" "Cannot validate health endpoint response: jq is not installed"
    return 1
  fi

  if ! jq -e '.status == "ok"' <<< "$body" &> /dev/null; then
    log "ERROR" "Health endpoint response invalid: $body"
    return 1
  fi

  log "INFO" "Health endpoint OK (HTTP $http_code)"
  return 0
}
# Check systemd service status
#
# Globals: SERVICE_NAME (read)
# Returns: 0 when the unit is active, 2 when it is not. A unit that is
#          active but not enabled only produces a warning.
check_service_status() {
  log "INFO" "Checking service status: $SERVICE_NAME"

  if systemctl is-active --quiet "$SERVICE_NAME"; then
    # Not enabled means it will not come back after a reboot; warn only
    systemctl is-enabled --quiet "$SERVICE_NAME" \
      || log "WARN" "Service $SERVICE_NAME is not enabled (won't start on boot)"
    log "INFO" "Service $SERVICE_NAME is active"
    return 0
  fi

  log "ERROR" "Service $SERVICE_NAME is not active"
  return 2
}
# Check database connectivity (quick MongoDB ping)
#
# Globals: HEALTH_DB_URI (read, optional) - database address passed to
#          mongosh; defaults to localhost:27017/tractatus_prod
# Returns: 0 when the ping succeeds within 5 seconds, 1 otherwise (including
#          when the mongosh client is not installed).
check_database() {
  log "INFO" "Checking database connectivity"

  local db_uri="${HEALTH_DB_URI:-localhost:27017/tractatus_prod}"

  # A missing client is a setup problem, not a database outage; log it as such
  if ! command -v mongosh &> /dev/null; then
    log "ERROR" "Database connection failed: mongosh client not found in PATH"
    return 1
  fi

  # Try to connect to MongoDB (timeout 5 seconds)
  if ! timeout 5 mongosh --quiet --eval "db.adminCommand('ping')" "$db_uri" &> /dev/null; then
    log "ERROR" "Database connection failed"
    return 1
  fi

  log "INFO" "Database connectivity OK"
  return 0
}
# Check disk space
#
# Reads the usage percentage of the root filesystem.
# Returns: 0 when usage is at most 90% (a warning is logged above 80%),
#          1 when usage exceeds 90% or cannot be determined.
check_disk_space() {
  log "INFO" "Checking disk space"

  # df -P gives the stable one-line-per-filesystem format meant for parsing;
  # column 5 is the capacity percentage.
  local usage
  usage=$(df -P / 2> /dev/null | awk 'NR==2 { sub(/%/, "", $5); print $5 }') || usage=""

  # An empty or non-numeric value would silently compare as 0 and be
  # reported as healthy, so treat it as a failed check instead.
  if [[ ! "$usage" =~ ^[0-9]+$ ]]; then
    log "ERROR" "Unable to determine disk usage for /"
    return 1
  fi

  if [[ "$usage" -gt 90 ]]; then
    log "CRITICAL" "Disk space critical: ${usage}% used"
    return 1
  elif [[ "$usage" -gt 80 ]]; then
    log "WARN" "Disk space high: ${usage}% used"
  else
    log "INFO" "Disk space OK: ${usage}% used"
  fi
  return 0
}
# Main health check
#
# Runs the service, endpoint, database and disk checks (all of them, even
# after a failure), maintains the consecutive-failure counter and sends an
# alert once the counter reaches MAX_FAILURES.
# Globals: MAX_FAILURES, SERVICE_NAME, HEALTH_URL (read)
# Exits:   0 when every check passed;
#          2 when the service check failed, as documented in the file header;
#          1 when only other checks failed.
main() {
  log "INFO" "Starting health check"

  local all_healthy=true
  local service_down=false
  local issues=()

  # Run all checks
  if ! check_service_status; then
    all_healthy=false
    service_down=true
    issues+=("Service not running")
  fi
  if ! check_health_endpoint; then
    all_healthy=false
    issues+=("Health endpoint failed")
  fi
  if ! check_database; then
    all_healthy=false
    issues+=("Database connectivity failed")
  fi
  if ! check_disk_space; then
    all_healthy=false
    issues+=("Disk space issue")
  fi

  # Handle results
  if [[ "$all_healthy" == "true" ]]; then
    log "INFO" "All health checks passed ✓"
    reset_failure_count
    exit 0
  fi

  log "ERROR" "Health check failed: ${issues[*]}"
  increment_failure_count

  local failure_count
  failure_count=$(get_failure_count) || true
  log "WARN" "Consecutive failures: $failure_count/$MAX_FAILURES"

  # Alert if threshold reached
  if [[ "$failure_count" -ge "$MAX_FAILURES" ]]; then
    local subject="[ALERT] Tractatus Health Check Failed ($failure_count failures)"
    local body="Tractatus health check has failed $failure_count times consecutively.
Issues detected:
$(printf -- "- %s\n" "${issues[@]}")
Time: $(date '+%Y-%m-%d %H:%M:%S %Z')
Host: $(hostname)
Service: $SERVICE_NAME
Health URL: $HEALTH_URL
Please investigate immediately.
View logs:
sudo journalctl -u $SERVICE_NAME -n 100
Check service status:
sudo systemctl status $SERVICE_NAME
Restart service:
sudo systemctl restart $SERVICE_NAME
"
    send_alert "$subject" "$body"
    log "CRITICAL" "Alert sent after $failure_count consecutive failures"
  fi

  # A stopped service takes precedence over the generic failure code
  if [[ "$service_down" == "true" ]]; then
    exit 2
  fi
  exit 1
}
# Entry point. The option loop above has already consumed every
# command-line argument, so "$@" is empty here.
main "$@"