// tractatus/scripts/framework-components/ProhibitedTermsScanner.js

/**
 * ProhibitedTermsScanner
 *
 * Proactively scans codebase for violations of inst_016/017/018
 * Part of Framework Improvement Phase 1: Proactive Content Scanning
 *
 * Usage:
 *   const scanner = new ProhibitedTermsScanner();
 *   const violations = await scanner.scan();
 *   const fixed = await scanner.autoFix(violations);
 *
 * CLI:
 *   node scripts/framework-components/ProhibitedTermsScanner.js [--details] [--fix] [--staged]
 */

const fs = require('fs').promises;
const path = require('path');
const { glob } = require('glob');
const { execSync } = require('child_process');

/**
 * Scans text files for wording prohibited by inst_016/017/018 and can
 * rewrite the simple cases in place.
 *
 * Each violation returned by scan() records the file, 1-based line and
 * column, the matched text, the rule it broke and a suggestion. autoFix()
 * uses that position to edit only the flagged occurrence.
 */
class ProhibitedTermsScanner {
  /**
   * @param {Object} [options] - Scanner options (extra keys are kept on this.options)
   * @param {boolean} [options.silent=false] - Suppress the progress line printed by scan()
   * @param {boolean} [options.fixMode=false] - Recorded for callers; not consulted by the class itself
   * @param {boolean} [options.staged=false] - Scan only files staged in git
   * @param {string} [options.basePath=process.cwd()] - Directory the file patterns are resolved against
   */
  constructor(options = {}) {
    // Caller options are spread first so that an explicitly undefined or
    // empty value cannot overwrite a default (an undefined basePath would
    // otherwise make every path lookup fail and the scan report zero files).
    this.options = {
      ...options,
      silent: options.silent || false,
      fixMode: options.fixMode || false,
      staged: options.staged || false,
      basePath: options.basePath || process.cwd()
    };

    // Pattern definitions from inst_016/017/018
    this.patterns = [
      {
        id: 'inst_017',
        name: 'Absolute Assurance Terms',
        severity: 'HIGH',
        patterns: [
          /\bguarantee(?:s|d|ing)?\b/gi,
          /ensures?\s+100%/gi,
          /eliminates?\s+all\b/gi,
          /completely\s+prevents?\b/gi,
          /never\s+fails?\b/gi,
          /always\s+works?\b/gi
        ],
        suggestions: {
          'guarantee': 'enforcement',
          'guarantees': 'enforces',
          'guaranteed': 'enforced',
          'guaranteeing': 'enforcing',
          'ensures 100%': 'helps ensure',
          'ensure 100%': 'help ensure',
          'eliminates all': 'reduces',
          'eliminate all': 'reduce',
          'completely prevents': 'designed to prevent',
          'completely prevent': 'designed to prevent',
          'never fails': 'designed to prevent failures',
          'never fail': 'designed to prevent failures',
          'always works': 'designed to work',
          'always work': 'designed to work'
        }
      },
      {
        id: 'inst_016',
        name: 'Fabricated Statistics',
        severity: 'HIGH',
        patterns: [
          // Match percentage claims without [NEEDS VERIFICATION] or source citations
          /\b\d+%\s+(?:faster|better|improvement|increase|decrease|reduction|more|less)\b(?!\s*\[NEEDS VERIFICATION\]|\s*\(source:|\s*\[source:)/gi,
          /\b(?:faster|better|improvement|increase|decrease|reduction)\s+of\s+\d+%\b(?!\s*\[NEEDS VERIFICATION\]|\s*\(source:|\s*\[source:)/gi
        ],
        suggestions: {
          // Advice only: there is no mechanical replacement for an unsourced statistic
          'default': 'Add [NEEDS VERIFICATION] or cite source'
        }
      },
      {
        id: 'inst_018',
        name: 'Unverified Readiness Claims',
        severity: 'MEDIUM',
        patterns: [
          /\bproduction-ready\b(?!\s+development\s+tool|\s+proof-of-concept)/gi,
          /\bbattle-tested\b/gi,
          /\benterprise-proven\b/gi,
          /\bwidespread\s+adoption\b/gi,
          /\bcustomer\s+base\b(?!\s+of\s+zero|\s+\(none\))/gi,
          /\bmarket\s+validation\b(?!\s+pending|\s+not\s+yet)/gi
        ],
        suggestions: {
          'production-ready': 'proof-of-concept',
          'battle-tested': 'in development',
          'enterprise-proven': 'designed for',
          'widespread adoption': 'early development',
          'customer base': 'development project',
          'market validation': 'internal validation'
        }
      }
    ];

    // File inclusion patterns
    this.includePatterns = [
      '**/*.md',
      '**/*.html',
      '**/*.js',
      '**/*.json',
      '**/*.jsx',
      '**/*.tsx'
    ];

    // File exclusion patterns
    this.excludePatterns = [
      '**/node_modules/**',
      '**/.git/**',
      '**/.claude/**',
      '**/tests/**/*.test.js',
      '**/tests/**/*.spec.js',
      '**/docs/case-studies/**',
      '**/GOVERNANCE-RULE-LIBRARY.md',
      '**/.claude/instruction-history.json',
      // The scanner's own pattern/suggestion literals contain the prohibited
      // terms; scanning (and auto-fixing) this file would rewrite its rules.
      '**/scripts/framework-components/ProhibitedTermsScanner.js',
      '**/dist/**',
      '**/build/**',
      '**/.next/**'
    ];
  }

  /**
   * Scan files for prohibited terms
   * @param {Object} options - Per-call overrides merged over this.options
   *   (only `staged` and `silent` are read here)
   * @returns {Promise<Array>} Array of violations, each
   *   `{ file, line, column, match, rule, ruleName, severity, context, suggestion }`
   *   with 1-based `line` and `column`
   */
  async scan(options = {}) {
    const scanOptions = { ...this.options, ...options };
    const violations = [];

    // Get files to scan
    const files = await this.getFilesToScan(scanOptions.staged);

    if (!scanOptions.silent) {
      console.log(`\n🔍 Scanning ${files.length} files for prohibited terms...`);
    }

    // Scan each file
    for (const file of files) {
      try {
        const content = await fs.readFile(file, 'utf8');
        // Split on '\n' only: autoFix() splits the same way, so line numbers
        // recorded here address the same lines there.
        const lines = content.split('\n');

        // Check each pattern type
        for (const patternSet of this.patterns) {
          for (const pattern of patternSet.patterns) {
            lines.forEach((line, index) => {
              for (const hit of this.findMatches(line, pattern)) {
                // Skip if in allowed context
                if (this.isAllowedContext(line, hit.text, file)) {
                  continue;
                }

                violations.push({
                  file,
                  line: index + 1,
                  column: hit.index + 1,
                  match: hit.text,
                  rule: patternSet.id,
                  ruleName: patternSet.name,
                  severity: patternSet.severity,
                  context: line.trim(),
                  suggestion: this.getSuggestion(hit.text, patternSet.suggestions)
                });
              }
            });
          }
        }
      } catch (err) {
        // A file that disappeared between listing and reading is skipped
        // quietly; anything else is reported but does not abort the scan.
        if (err.code !== 'ENOENT') {
          console.error(`⚠ Error reading ${file}: ${err.message}`);
        }
      }
    }

    return violations;
  }

  /**
   * Auto-fix simple violations.
   *
   * Only the flagged occurrence is rewritten: the violation's line (and
   * column, when present) locates the text, so identical wording elsewhere
   * in the file - for example on a line that scan() allowed - is left alone.
   * Violations whose suggestion is advice rather than replacement text, or
   * whose text can no longer be found on the recorded line, are counted as
   * skipped.
   *
   * @param {Array} violations - Violations to fix (as returned by scan())
   * @returns {Promise<Object>} Fix results `{ fixed, total, skipped, errors }`
   */
  async autoFix(violations) {
    const results = {
      fixed: 0,
      total: violations.length,
      skipped: 0,
      errors: []
    };

    // Group violations by file
    const fileGroups = violations.reduce((acc, v) => {
      if (!acc[v.file]) acc[v.file] = [];
      acc[v.file].push(v);
      return acc;
    }, {});

    // Fix each file
    for (const [file, fileViolations] of Object.entries(fileGroups)) {
      try {
        const content = await fs.readFile(file, 'utf8');
        const lines = content.split('\n');
        let modified = false;

        // Work from the end of the file (and the end of each line) backwards
        // so an edit never shifts the column of a violation still to be fixed.
        const ordered = [...fileViolations].sort(
          (a, b) => (b.line - a.line) || ((b.column || 0) - (a.column || 0))
        );

        for (const violation of ordered) {
          if (this.isAdvisorySuggestion(violation.suggestion)) {
            results.skipped++;
            continue;
          }

          const lineIndex = violation.line - 1;
          const lineText = lines[lineIndex];
          const start = typeof lineText === 'string'
            ? this.locateMatch(lineText, violation)
            : -1;

          // The file changed since the scan; do not guess at another location
          if (start === -1) {
            results.skipped++;
            continue;
          }

          const replacement = this.matchCase(violation.match, violation.suggestion);
          lines[lineIndex] =
            lineText.slice(0, start) +
            replacement +
            lineText.slice(start + violation.match.length);
          modified = true;
          results.fixed++;
        }

        // Write file if modified
        if (modified) {
          await fs.writeFile(file, lines.join('\n'), 'utf8');
          console.log(`✓ Fixed ${file}`);
        }
      } catch (err) {
        results.errors.push({ file, error: err.message });
        console.error(`✗ Error fixing ${file}: ${err.message}`);
      }
    }

    return results;
  }

  /**
   * Get files to scan.
   *
   * In staged mode the list comes from git (added, copied, modified or
   * renamed entries only, so deleted files are not returned) and is filtered
   * through the same include/exclude patterns as a full scan. If git fails,
   * the method falls back to the full scan.
   *
   * @param {boolean} stagedOnly - Only scan staged files
   * @returns {Promise<Array>} Array of absolute file paths
   */
  async getFilesToScan(stagedOnly = false) {
    const basePath = this.options.basePath;

    if (stagedOnly) {
      try {
        // --relative reports paths relative to basePath (and limits the list
        // to that directory) so they can be joined onto it below.
        const output = execSync(
          'git diff --cached --name-only --diff-filter=ACMR --relative',
          { encoding: 'utf8', cwd: basePath }
        );
        return output
          .split('\n')
          .map(name => name.trim())
          .filter(name => name && this.matchesScanFilters(name))
          .map(name => path.join(basePath, name));
      } catch (err) {
        console.error('⚠ Error getting staged files, falling back to all files');
      }
    }

    // Use glob to find all matching files
    const files = [];
    for (const pattern of this.includePatterns) {
      try {
        const matches = await glob(pattern, {
          ignore: this.excludePatterns,
          nodir: true,
          cwd: basePath
        });
        if (Array.isArray(matches)) {
          // Prepend base path to make absolute paths
          const absolutePaths = matches.map(f => path.join(basePath, f));
          files.push(...absolutePaths);
        }
      } catch (err) {
        // Keep scanning the other patterns, but say so: a silent failure
        // here would shrink the file list and could hide violations.
        console.error(`⚠ Error expanding ${pattern}: ${err.message}`);
      }
    }

    // Remove duplicates
    return [...new Set(files)];
  }

  /**
   * Check if context allows the term
   * @param {string} line - Line containing match
   * @param {string} match - Matched term
   * @param {string} file - File path
   * @returns {boolean} True if allowed
   */
  isAllowedContext(line, match, file) {
    // Allow in comments about the rules themselves
    if (line.includes('inst_017') || line.includes('inst_016') || line.includes('inst_018')) {
      return true;
    }

    // Allow in GOVERNANCE-RULE-LIBRARY.md
    if (file.includes('GOVERNANCE-RULE-LIBRARY.md')) {
      return true;
    }

    // Allow in case studies
    if (file.includes('case-studies')) {
      return true;
    }

    // Allow in test files (shouldn't reach here but double-check)
    if (file.includes('.test.') || file.includes('.spec.')) {
      return true;
    }

    // Allow "production-ready development tool" or "production-ready proof-of-concept"
    if (match.toLowerCase() === 'production-ready') {
      if (line.includes('development tool') || line.includes('proof-of-concept')) {
        return true;
      }
    }

    return false;
  }

  /**
   * Get suggestion for a match.
   *
   * The match is lower-cased and its whitespace runs are collapsed to single
   * spaces before lookup, because the patterns accept any whitespace between
   * words while the suggestion keys use exactly one space.
   *
   * @param {string} match - Matched term
   * @param {Object} suggestions - Suggestion map (term -> replacement, optional `default` advice)
   * @returns {string} Replacement text, or the map's `default` advice, or 'Review and revise'
   */
  getSuggestion(match, suggestions = {}) {
    const normalized = match.toLowerCase().replace(/\s+/g, ' ').trim();

    // Try exact match first ('default' is the fallback advice, not a term)
    if (normalized !== 'default' && suggestions[normalized]) {
      return suggestions[normalized];
    }

    // Try partial matches
    for (const [key, value] of Object.entries(suggestions)) {
      if (key !== 'default' && normalized.includes(key)) {
        return value;
      }
    }

    return suggestions.default || 'Review and revise';
  }

  /**
   * Format violations for display
   * @param {Array} violations - Violations to format
   * @param {boolean} detailed - Show detailed output
   * @returns {string} Formatted output
   */
  formatViolations(violations, detailed = false) {
    if (violations.length === 0) {
      return '\n✅ No prohibited terms found\n';
    }

    // Group by rule
    const byRule = violations.reduce((acc, v) => {
      if (!acc[v.rule]) acc[v.rule] = [];
      acc[v.rule].push(v);
      return acc;
    }, {});

    let output = `\n⚠  Found ${violations.length} violation(s):\n`;

    // Summary
    for (const [rule, items] of Object.entries(byRule)) {
      output += `   ${rule}: ${items.length} violation(s)\n`;
    }

    // Details
    if (detailed) {
      output += '\nDetails:\n';
      for (const v of violations) {
        const context = String(v.context || '');
        // Mark the context as cut off only when it really was
        const shown = context.length > 80 ? `${context.substring(0, 80)}...` : context;

        output += `\n  ${v.file}:${v.line}\n`;
        output += `    Rule: ${v.rule} (${v.severity})\n`;
        output += `    Found: "${v.match}"\n`;
        output += `    Context: ${shown}\n`;
        output += `    Suggestion: ${v.suggestion}\n`;
      }
    } else {
      output += '\nRun with --details for full violation list\n';
    }

    output += '\nTo fix: node scripts/framework-components/ProhibitedTermsScanner.js --fix\n';

    return output;
  }

  /**
   * Escape regex special characters
   * @param {string} str - String to escape
   * @returns {string} Escaped string
   */
  escapeRegex(str) {
    return str.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  }

  /**
   * Collect every match of a pattern in one line together with its position.
   * @param {string} line - Text to search
   * @param {RegExp} pattern - Pattern to apply (with or without the `g` flag)
   * @returns {Array<{text: string, index: number}>} Matches in left-to-right order
   */
  findMatches(line, pattern) {
    // The pattern objects are shared across lines and files, and exec() on a
    // global regex resumes from lastIndex, so always start from the beginning.
    pattern.lastIndex = 0;

    if (!pattern.global) {
      const single = pattern.exec(line);
      pattern.lastIndex = 0;
      return single ? [{ text: single[0], index: single.index }] : [];
    }

    const found = [];
    let hit;
    while ((hit = pattern.exec(line)) !== null) {
      found.push({ text: hit[0], index: hit.index });
      // An empty match does not advance lastIndex; step past it to avoid looping forever
      if (hit[0] === '') {
        pattern.lastIndex++;
      }
    }
    return found;
  }

  /**
   * Find where a violation's text starts within its line.
   * @param {string} lineText - Current content of the violation's line
   * @param {Object} violation - Violation with `match` and optional 1-based `column`
   * @returns {number} 0-based start index, or -1 when the text is not on the line
   */
  locateMatch(lineText, violation) {
    const term = violation.match;
    if (typeof term !== 'string' || term === '') {
      return -1;
    }

    // Prefer the exact recorded position when it still holds the matched text
    if (Number.isInteger(violation.column)) {
      const recorded = violation.column - 1;
      if (recorded >= 0 && lineText.substr(recorded, term.length) === term) {
        return recorded;
      }
    }

    return lineText.indexOf(term);
  }

  /**
   * Tell whether a suggestion is guidance for a human rather than text that
   * can be substituted for the match.
   * @param {string} suggestion - Suggestion carried by a violation
   * @returns {boolean} True when the suggestion must not be written into a file
   */
  isAdvisorySuggestion(suggestion) {
    if (!suggestion || suggestion === 'Review and revise') {
      return true;
    }
    return this.patterns.some(
      set => set.suggestions && set.suggestions.default === suggestion
    );
  }

  /**
   * Carry the capitalisation of the replaced text over to its replacement.
   * @param {string} original - Text being replaced
   * @param {string} replacement - Lower-case replacement text
   * @returns {string} Replacement in upper case when the original was all
   *   upper case, with a capital first letter when the original had one,
   *   otherwise unchanged
   */
  matchCase(original, replacement) {
    const hasLetters = original.toUpperCase() !== original.toLowerCase();
    if (hasLetters && original === original.toUpperCase()) {
      return replacement.toUpperCase();
    }

    const first = original.charAt(0);
    if (first !== first.toLowerCase()) {
      return replacement.charAt(0).toUpperCase() + replacement.slice(1);
    }

    return replacement;
  }

  /**
   * Apply includePatterns/excludePatterns to a single relative path.
   * Used for staged files, which do not pass through glob.
   * @param {string} relativePath - Path relative to the base directory
   * @returns {boolean} True when the path matches an include pattern and no exclude pattern
   */
  matchesScanFilters(relativePath) {
    const candidate = relativePath.replace(/\\/g, '/');
    const matchesAny = patterns =>
      patterns.some(globPattern => this.globToRegExp(globPattern).test(candidate));

    return matchesAny(this.includePatterns) && !matchesAny(this.excludePatterns);
  }

  /**
   * Translate a glob pattern into an anchored regular expression.
   *
   * Supports `**` (any number of directories), `*` and `?` (within one path
   * segment); every other character is matched literally. This is a
   * simplified translation and may not reproduce every rule of the glob
   * package (for example its dotfile handling).
   *
   * @param {string} globPattern - Glob using forward slashes
   * @returns {RegExp} Expression matching whole relative paths
   */
  globToRegExp(globPattern) {
    let source = '';

    for (let i = 0; i < globPattern.length; i++) {
      const ch = globPattern[i];

      if (ch === '*' && globPattern[i + 1] === '*') {
        i++;
        if (globPattern[i + 1] === '/') {
          // '**/' also matches zero directories
          i++;
          source += '(?:.*/)?';
        } else {
          source += '.*';
        }
      } else if (ch === '*') {
        source += '[^/]*';
      } else if (ch === '?') {
        source += '[^/]';
      } else {
        source += this.escapeRegex(ch);
      }
    }

    return new RegExp(`^${source}$`);
  }
}

// CLI interface
/**
 * Command-line entry point.
 *
 * Reads the --fix, --staged and --details flags from process.argv, runs a
 * scan, prints the report, optionally applies auto-fixes, and then ends the
 * process with status 1 when the scan found violations (0 otherwise).
 */
async function main() {
  const cliArgs = process.argv.slice(2);
  const hasFlag = (flag) => cliArgs.includes(flag);

  const options = {
    silent: false,
    fixMode: hasFlag('--fix'),
    staged: hasFlag('--staged'),
    details: hasFlag('--details')
  };
  const scanner = new ProhibitedTermsScanner(options);

  // Banner
  console.log('\n━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━');
  console.log('  Tractatus Framework - Prohibited Terms Scanner');
  console.log('━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━');

  const violations = await scanner.scan();
  const foundAny = violations.length > 0;

  console.log(scanner.formatViolations(violations, options.details));

  if (options.fixMode && foundAny) {
    console.log('\n🔧 Applying auto-fixes...\n');
    const { fixed, skipped, errors } = await scanner.autoFix(violations);
    console.log(`\n✓ Fixed: ${fixed}`);
    console.log(`⊘ Skipped: ${skipped} (manual review required)`);
    if (errors.length > 0) {
      console.log(`✗ Errors: ${errors.length}`);
    }
  }

  // The status reflects what the scan found (before any fixes), so a
  // pre-commit hook still stops when violations were present.
  const exitCode = foundAny ? 1 : 0;
  process.exit(exitCode);
}

// Start the CLI only when this file is the process entry point; when it is
// loaded with require() it just exposes the class.
const invokedDirectly = require.main === module;
if (invokedDirectly) {
  main().catch((cliError) => {
    console.error('Error:', cliError);
    process.exit(1);
  });
}

module.exports = ProhibitedTermsScanner;