tractatus/scripts/analyze-claude-md.js

#!/usr/bin/env node

/**
 * CLAUDE.md Extraction & Analysis Script
 *
 * Extracts governance rules from CLAUDE.md files for Tractatus framework integration.
 *
 * Focuses on TWO rule layers:
 * 1. Development Environment Rules - Framework governance for Claude Code sessions
 * 2. Architectural Constraints - System-wide rules enforced at code level
 *
 * IGNORES:
 * - Tenant-specific configuration (belongs in MongoDB)
 * - Product defaults for new tenants (code constants, separate design)
 * - Credentials (belong in .env or credential vault)
 */

const fs = require('fs');
const path = require('path');

// Color output: ANSI escape sequences keyed by the names accepted by log().
// Each sequence is ESC + "[" + numeric code + "m".
const colors = [
  ['reset', 0],
  ['green', 32],
  ['yellow', 33],
  ['blue', 34],
  ['red', 31],
  ['cyan', 36],
  ['bold', 1],
  ['gray', 90]
].reduce((table, [name, code]) => {
  table[name] = `\x1b[${code}m`;
  return table;
}, {});

/**
 * Print a message to stdout wrapped in a color escape sequence.
 *
 * @param {string} message - Text to print.
 * @param {string} [color='reset'] - Key of the `colors` table. A name that is
 *   not in the table falls back to the reset sequence, so the message is
 *   printed uncolored instead of being prefixed with the text "undefined".
 */
function log(message, color = 'reset') {
  const known = Object.prototype.hasOwnProperty.call(colors, color);
  const prefix = known ? colors[color] : colors.reset;
  console.log(`${prefix}${message}${colors.reset}`);
}

/**
 * Print a banner: a blank line, the title framed above and below by an
 * 80-character rule, then another blank line.
 *
 * @param {string} message - Title shown (indented two spaces) between the rules.
 */
function header(message) {
  const rule = '═'.repeat(80);
  const bannerLines = [
    [rule, 'cyan'],
    [`  ${message}`, 'bold'],
    [rule, 'cyan']
  ];

  console.log('');
  for (const [text, color] of bannerLines) {
    log(text, color);
  }
  console.log('');
}

/**
 * Print a section title: a blank line followed by the title, prefixed with
 * "▶ ", in blue.
 *
 * @param {string} message - Section title.
 */
function section(message) {
  console.log('');
  const title = '▶ '.concat(message);
  log(title, 'blue');
}

// Parse command line arguments: the first argument is the file to analyze;
// without it, CLAUDE.md in the parent of this script's directory is used.
const args = process.argv.slice(2);
const claudeMdPath = args[0] || path.join(__dirname, '../CLAUDE.md');

if (!fs.existsSync(claudeMdPath)) {
  log(`Error: File not found: ${claudeMdPath}`, 'red');
  process.exit(1);
}

// existsSync() also succeeds for directories and unreadable files, so report a
// failed read the same way as a missing file rather than as an uncaught
// exception with a stack trace.
let content;
try {
  content = fs.readFileSync(claudeMdPath, 'utf8');
} catch (err) {
  log(`Error: Cannot read ${claudeMdPath}: ${err.message}`, 'red');
  process.exit(1);
}

// Split on LF or CRLF. Splitting on "\n" alone leaves a trailing "\r" on every
// line of a CRLF file, and since `.` in a regex never matches "\r", patterns
// anchored with `(.+)$` (such as the heading matcher) would never match.
const lines = content.split(/\r?\n/);

header('CLAUDE.md Rule Extraction & Analysis');
log(`File: ${claudeMdPath}`, 'cyan');
log(`Lines: ${lines.length}`, 'cyan');

// Rule patterns to detect. Every entry is a case-insensitive regex of the form
// /\b(alt1|alt2|...)\b/i, i.e. a whole-word match on any one alternative.
const patterns = (() => {
  // Alternatives are regex fragments and are inserted verbatim (not escaped).
  const anyOf = (...alternatives) => new RegExp(`\\b(${alternatives.join('|')})\\b`, 'i');

  return {
    // Imperative language
    must: anyOf('MUST', 'ALWAYS', 'NEVER', 'REQUIRED', 'SHALL', 'PROHIBITED'),
    should: anyOf('SHOULD', 'RECOMMENDED', 'AVOID', 'PREFER'),
    may: anyOf('MAY', 'CAN', 'OPTIONAL', 'CONSIDER'),

    // Multi-tenant specific
    tenant: anyOf('tenant', 'multi-tenant', 'tenantId', 'isolation'),
    gdpr: anyOf('GDPR', 'privacy', 'consent', 'retention', 'data protection'),

    // Architecture patterns
    port: anyOf('port\\s+\\d+', ':\\d{4,5}'),
    database: anyOf('MongoDB', 'database', 'collection', 'query'),
    deployment: anyOf('deploy', 'deployment', 'production', 'systemd', 'pm2'),

    // Security patterns
    security: anyOf('security', 'auth', 'credential', 'password', 'token', 'api key'),

    // Development patterns
    testing: anyOf('test', 'testing', 'local', 'development', 'dev'),
    session: anyOf('session', 'handoff')
  };
})();

// Extract sections: map each markdown heading title to the text that follows
// it, up to the next heading. Text before the first heading is stored under
// 'preamble'. A heading with no lines at all before the next heading is not
// stored.
const sections = {};
let currentSection = 'preamble';
let sectionContent = [];

{
  // True while between an opening and a closing ``` fence line. Inside a fence
  // a leading "#" is code (e.g. a shell comment), not a heading.
  let inCodeFence = false;

  // Save the lines collected for the current section. A repeated title gets a
  // " (2)", " (3)", ... suffix so that it does not overwrite the earlier
  // section of the same name.
  const storeSection = () => {
    if (sectionContent.length === 0) {
      return;
    }
    let key = currentSection;
    for (let n = 2; Object.prototype.hasOwnProperty.call(sections, key); n += 1) {
      key = `${currentSection} (${n})`;
    }
    sections[key] = sectionContent.join('\n');
  };

  lines.forEach(line => {
    if (line.trim().startsWith('```')) {
      inCodeFence = !inCodeFence;
    }

    const heading = inCodeFence ? null : line.match(/^#+\s+(.+)$/);
    if (heading) {
      storeSection();
      currentSection = heading[1];
      sectionContent = [];
    } else {
      sectionContent.push(line);
    }
  });

  storeSection();
}

// Analyze sections: print how many sections were collected, then one line per
// section with the number of lines in its body.
section('1. Document Structure');
log(`  Sections found: ${Object.keys(sections).length}`, 'cyan');
for (const [title, body] of Object.entries(sections)) {
  const lineCount = body.split('\n').length;
  log(`    - ${title} (${lineCount} lines)`, 'gray');
}

// Extract candidate rules: every line that contains imperative wording is
// tagged with the topics it mentions and filed into one of the buckets below.
section('2. Candidate Rules Extraction');

const candidates = {
  layer1_dev: [],      // Development environment rules
  layer2_arch: [],     // Architectural constraints
  ignored_creds: [],   // Credentials (should be in .env)
  ignored_config: [],  // Tenant config (should be in MongoDB); not filled by this pass
  ignored_vague: []    // Too vague to be rules
};

{
  // [key in `patterns`, tag stored on the rule], in the order tags are recorded.
  const topicTags = [
    ['tenant', 'multi-tenant'],
    ['gdpr', 'GDPR'],
    ['port', 'port'],
    ['database', 'database'],
    ['deployment', 'deployment'],
    ['security', 'security'],
    ['testing', 'testing'],
    ['session', 'session']
  ];
  const archTags = ['multi-tenant', 'GDPR'];
  const devTags = ['port', 'deployment', 'testing', 'session'];

  // True while between an opening and a closing ``` fence line. Note that an
  // unclosed fence therefore excludes everything after it.
  let inCodeFence = false;

  lines.forEach((line, idx) => {
    const trimmed = line.trim();

    // Skip code blocks: the fence lines themselves and everything between them.
    if (trimmed.startsWith('```')) {
      inCodeFence = !inCodeFence;
      return;
    }
    if (inCodeFence) {
      return;
    }

    // Skip empty lines, comments, and markdown headings
    if (!trimmed || trimmed.startsWith('//') || trimmed.startsWith('#')) {
      return;
    }

    // Detect imperative statements
    const hasMust = patterns.must.test(trimmed);
    const hasShould = patterns.should.test(trimmed);
    const hasMay = patterns.may.test(trimmed);

    if (!hasMust && !hasShould && !hasMay) {
      return; // Not a rule candidate
    }

    const rule = {
      line: idx + 1, // 1-based line number in the source file
      text: trimmed,
      imperative: hasMust ? 'MUST' : hasShould ? 'SHOULD' : 'MAY',
      patterns: topicTags
        .filter(([key]) => patterns[key].test(trimmed))
        .map(([, tag]) => tag)
    };
    const hasAnyTag = tags => tags.some(tag => rule.patterns.includes(tag));

    // Credentials → ignore
    if (/password|credential|admin.*@|test.*@.*:/i.test(trimmed)) {
      candidates.ignored_creds.push(rule);
      return;
    }

    // Layer 2: Architectural constraints (multi-tenant, GDPR, mandatory database rules)
    if (hasAnyTag(archTags) || (hasMust && rule.patterns.includes('database'))) {
      candidates.layer2_arch.push(rule);
      return;
    }

    // Layer 1: Development environment (ports, deployment, testing, sessions)
    if (hasAnyTag(devTags)) {
      candidates.layer1_dev.push(rule);
      return;
    }

    // Anything left has no layer-specific tag: a MUST statement still counts
    // as a Layer 1 rule, everything weaker is considered too vague.
    if (hasMust) {
      candidates.layer1_dev.push(rule);
    } else {
      candidates.ignored_vague.push(rule);
    }
  });
}

// Display Layer 1 (Development Environment), then Layer 2 (Architectural
// Constraints). Both use the same layout: a count line followed by a numbered
// entry per rule showing source line, imperative, text, and matched patterns.
for (const { title, noun, rules } of [
  { title: '3. Layer 1: Development Environment Rules', noun: 'development', rules: candidates.layer1_dev },
  { title: '4. Layer 2: Architectural Constraints', noun: 'architectural', rules: candidates.layer2_arch }
]) {
  section(title);
  log(`  Found ${rules.length} ${noun} rules`, 'green');
  console.log('');

  for (const [idx, rule] of rules.entries()) {
    log(`  ${idx + 1}. [Line ${rule.line}] ${rule.imperative}`, 'cyan');
    log(`     ${rule.text}`, 'gray');
    log(`     Patterns: ${rule.patterns.join(', ')}`, 'yellow');
    console.log('');
  }
}

// Display ignored items. Each item is shown on one line, cut to 80 characters;
// the "..." marker is added only when the text was actually cut, so a short
// line is not made to look truncated.
section('5. Ignored Items');

log(`  Credentials (${candidates.ignored_creds.length}) - belong in .env or vault:`, 'yellow');
candidates.ignored_creds.forEach(rule => {
  const preview = rule.text.length > 80 ? `${rule.text.substring(0, 80)}...` : rule.text;
  log(`    [Line ${rule.line}] ${preview}`, 'gray');
});
console.log('');

log(`  Vague statements (${candidates.ignored_vague.length}) - not actionable rules:`, 'yellow');
candidates.ignored_vague.forEach(rule => {
  const preview = rule.text.length > 80 ? `${rule.text.substring(0, 80)}...` : rule.text;
  log(`    [Line ${rule.line}] ${preview}`, 'gray');
});

// Rule quality scoring
section('6. Rule Quality Analysis');

/**
 * Compute a quality score for a candidate rule, capped at 100.
 *
 * Points are the sum of:
 *  - imperative strength: MUST 40, SHOULD 20, anything else 10
 *  - 10 per entry in `rule.patterns`
 *  - length: 20 for 5-20 words, 10 for more than 20 words, 0 for fewer than 5
 *  - 10 when the text contains a 4-5 digit number, a slash-prefixed path,
 *    or a run of three or more uppercase letters/underscores
 *
 * @param {{imperative: string, patterns: string[], text: string}} rule
 * @returns {number} Score between 0 and 100.
 */
function scoreRule(rule) {
  let imperativePoints;
  switch (rule.imperative) {
    case 'MUST':
      imperativePoints = 40;
      break;
    case 'SHOULD':
      imperativePoints = 20;
      break;
    default:
      imperativePoints = 10;
  }

  const specificityPoints = rule.patterns.length * 10;

  const words = rule.text.split(/\s+/).length;
  const lengthPoints = words > 20 ? 10 : words >= 5 ? 20 : 0;

  const hasParameters = /\d{4,5}|\/[\w/-]+|[A-Z_]{3,}/.test(rule.text);
  const parameterPoints = hasParameters ? 10 : 0;

  const total = imperativePoints + specificityPoints + lengthPoints + parameterPoints;
  return Math.min(100, total);
}

// Score every extracted rule (both layers) and order them best-first.
const allRules = [...candidates.layer1_dev, ...candidates.layer2_arch];
const scored = allRules
  .map(rule => ({ ...rule, score: scoreRule(rule) }))
  .sort((a, b) => b.score - a.score);

// With no extracted rules the mean would be 0 / 0 = NaN; show 0.0 instead.
const averageScoreText = scored.length > 0
  ? (scored.reduce((sum, r) => sum + r.score, 0) / scored.length).toFixed(1)
  : '0.0';

const highQuality = scored.filter(r => r.score >= 70);
const needsWork = scored.filter(r => r.score < 70);

// Print "[score] text" with the text cut to 70 characters; "..." is appended
// only when the text was actually cut.
const printScoredRule = rule => {
  const preview = rule.text.length > 70 ? `${rule.text.substring(0, 70)}...` : rule.text;
  log(`    [${rule.score}] ${preview}`, 'gray');
};

log(`  Average quality score: ${averageScoreText}/100`, 'cyan');
console.log('');

log(`  High-quality rules (score ≥ 70):`, 'green');
highQuality.forEach(rule => printScoredRule(rule));
console.log('');

log(`  Needs improvement (score < 70):`, 'yellow');
needsWork.forEach(rule => printScoredRule(rule));

// Suggested improvements: for every rule scoring below 70, list the checks it
// fails. Rules that fail none of the checks are not printed.
section('7. Suggested Improvements');

for (const rule of needsWork) {
  const hasPatterns = rule.patterns.length > 0;
  const wordCount = rule.text.split(/\s+/).length;
  // Same "concrete value" test as the scorer: a 4-5 digit number, a
  // slash-prefixed path, or a run of 3+ uppercase letters/underscores.
  const hasConcreteValue = /\d{4,5}|\/[\w/-]+|[A-Z_]{3,}/.test(rule.text);

  const suggestions = [
    [rule.imperative !== 'MUST' && hasPatterns, `Change "${rule.imperative}" to "MUST" for stronger enforcement`],
    [!hasPatterns, 'Add specific parameters (ports, paths, constraints)'],
    [wordCount < 5, 'Add more context - why is this rule important?'],
    [!hasConcreteValue, 'Add concrete values (port numbers, file paths, constants)']
  ]
    .filter(([applies]) => applies)
    .map(([, advice]) => advice);

  if (suggestions.length === 0) {
    continue;
  }

  log(`  ${rule.text}`, 'gray');
  for (const advice of suggestions) {
    log(`    → ${advice}`, 'yellow');
  }
  console.log('');
}

// Generate instruction-history.json format
section('8. Proposed instruction-history.json Entries');

/**
 * Convert an extracted rule into an entry in the proposed
 * instruction-history.json format.
 *
 * @param {{line: number, text: string, imperative: string, patterns: string[], score?: number}} rule
 *   Extracted rule. `score` is optional; when absent it is computed with
 *   scoreRule().
 * @param {string} layer - 'layer1_dev' or 'layer2_arch'. Any value other than
 *   'layer1_dev' is labelled 'Architectural Constraint'.
 * @returns {object} Instruction entry. `id` combines the layer, the current
 *   timestamp, and a short random base-36 suffix; `extracted_from` is the
 *   script-level `claudeMdPath`.
 */
function convertToInstruction(rule, layer) {
  const hasTag = tag => rule.patterns.includes(tag);

  // Layer 2 rules are always SYSTEM; Layer 1 rules about deployment or
  // sessions are OPERATIONAL.
  const quadrant = layer === 'layer2_arch' ? 'SYSTEM' :
                   hasTag('deployment') ? 'OPERATIONAL' :
                   hasTag('session') ? 'OPERATIONAL' : 'SYSTEM';

  const persistence = rule.imperative === 'MUST' ? 'HIGH' :
                      rule.imperative === 'SHOULD' ? 'MEDIUM' : 'LOW';

  // First matching tag wins.
  const category = hasTag('multi-tenant') ? 'architecture' :
                   hasTag('security') ? 'security' :
                   hasTag('deployment') ? 'deployment' :
                   hasTag('testing') ? 'quality' : 'technical';

  // The rules stored in `candidates` carry no `score` (it exists only on the
  // copies in `scored`). Without computing it here, `undefined >= 70` is
  // always false and every instruction would get priority 70.
  const score = typeof rule.score === 'number' ? rule.score : scoreRule(rule);

  // slice(2, 7) skips the leading "0." and keeps up to five characters
  // (same result as the deprecated substr(2, 5)).
  const randomSuffix = Math.random().toString(36).slice(2, 7);

  return {
    id: `fh_${layer}_${Date.now()}_${randomSuffix}`,
    text: rule.text,
    quadrant,
    persistence,
    category,
    temporal_scope: 'PERMANENT',
    priority: score >= 70 ? 90 : 70,
    source: 'claude_md_extraction',
    active: true,
    created_date: new Date().toISOString().split('T')[0], // YYYY-MM-DD (UTC)
    extracted_from: claudeMdPath,
    original_line: rule.line,
    patterns: rule.patterns,
    layer: layer === 'layer1_dev' ? 'Development Environment' : 'Architectural Constraint'
  };
}

// Convert both rule layers into instruction entries.
const instructions = {
  layer1: candidates.layer1_dev.map(r => convertToInstruction(r, 'layer1_dev')),
  layer2: candidates.layer2_arch.map(r => convertToInstruction(r, 'layer2_arch'))
};

log(`  Layer 1 (Development): ${instructions.layer1.length} instructions`, 'green');
log(`  Layer 2 (Architecture): ${instructions.layer2.length} instructions`, 'green');
console.log('');

// Output JSON next to the source file: "<source without extension>_extracted_rules.json".
// Only the real file extension is stripped. A plain replace('.md', ...) would
// rewrite the first ".md" anywhere in the path (e.g. inside a directory name)
// and, for a source without ".md", would leave the path unchanged so the
// report would overwrite the source file itself.
const sourceExt = path.extname(claudeMdPath);
const outputPath = `${claudeMdPath.slice(0, claudeMdPath.length - sourceExt.length)}_extracted_rules.json`;

// With no extracted rules the mean would be 0 / 0 = NaN; report 0.0 instead.
const reportAverageScore = scored.length > 0
  ? (scored.reduce((sum, r) => sum + r.score, 0) / scored.length).toFixed(1)
  : '0.0';

const output = {
  metadata: {
    source_file: claudeMdPath,
    extracted_at: new Date().toISOString(),
    total_rules: instructions.layer1.length + instructions.layer2.length,
    layer1_count: instructions.layer1.length,
    layer2_count: instructions.layer2.length,
    average_score: reportAverageScore
  },
  instructions: {
    layer1_development: instructions.layer1,
    layer2_architecture: instructions.layer2
  },
  ignored: {
    credentials: candidates.ignored_creds.length,
    vague_statements: candidates.ignored_vague.length
  }
};

fs.writeFileSync(outputPath, JSON.stringify(output, null, 2));
log(`✓ Saved to: ${outputPath}`, 'green');

// Summary
section('9. Summary & Next Steps');

log(`  Total rules extracted: ${allRules.length}`, 'bold');
log(`    - Layer 1 (Development): ${candidates.layer1_dev.length}`, 'cyan');
log(`    - Layer 2 (Architecture): ${candidates.layer2_arch.length}`, 'cyan');
log(`  Ignored items: ${candidates.ignored_creds.length + candidates.ignored_vague.length}`, 'yellow');
log(`  Average quality: ${reportAverageScore}/100`, 'green');
console.log('');

log('  Next steps:', 'bold');
log('    1. Review extracted rules in JSON output', 'cyan');
log('    2. Manually improve low-quality rules (score < 70)', 'cyan');
log('    3. Add missing rules not detected by patterns', 'cyan');
log('    4. Import to instruction-history.json', 'cyan');
console.log('');

log('═'.repeat(80), 'cyan');