tractatus/scripts/recategorize-sections.js

#!/usr/bin/env node

/**
 * Smart Section Recategorization Script
 *
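 * Scores each section against per-category rules and reassigns its category based on:
 * - Section title (regex patterns and keywords)
 * - Keywords in the section excerpt
 * - Position in the document (first / last section)
 * - Keywords in the parent document's title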
 *
 * Categories:
 * - critical: Warnings, blockers, security issues, mandatory prerequisites
 * - conceptual: Foundational ideas, "why this matters", core principles
 * - practical: How-to guides, examples, step-by-step instructions
 * - technical: Architecture, implementation details, code examples
 * - reference: Appendices, glossaries, further reading, contact info
 *
 * Usage:
 *   node scripts/recategorize-sections.js --dry-run    # Preview changes
 *   node scripts/recategorize-sections.js              # Apply changes
 *   node scripts/recategorize-sections.js --doc=slug   # Single document
 */

const { MongoClient } = require('mongodb');

// Parse arguments

/**
 * Parse the command-line flags understood by this script.
 *
 * Recognized flags:
 *   --dry-run      preview changes without writing to the database
 *   --doc=<slug>   restrict processing to a single document
 *
 * @param {string[]} argv - Arguments after the node binary and script path.
 * @returns {{ dryRun: boolean, specificDoc: (string|undefined) }}
 * @throws {Error} If `--doc=` is given with an empty value.
 */
function parseCliArgs(argv) {
  const docFlag = '--doc=';
  const docArg = argv.find(a => a.startsWith(docFlag));

  // Take everything after the flag prefix so slugs containing '=' survive intact.
  const specificDoc = docArg === undefined ? undefined : docArg.slice(docFlag.length);

  // An empty slug (e.g. `--doc=$UNSET_VAR`) would be falsy downstream and
  // silently widen the run to every document, so refuse it up front.
  if (specificDoc === '') {
    throw new Error('--doc= requires a non-empty document slug');
  }

  return {
    dryRun: argv.includes('--dry-run'),
    specificDoc
  };
}

const args = process.argv.slice(2);
const { dryRun, specificDoc } = parseCliArgs(args);

// Run-wide counters, mutated while documents are processed and read back
// when the final summary is printed.
const stats = {
  totalDocuments: 0,
  totalSections: 0,
  changed: 0,
  unchanged: 0,
  // One independent { before, after } tally per known category, in display order.
  byCategory: Object.fromEntries(
    ['critical', 'conceptual', 'practical', 'technical', 'reference']
      .map(name => [name, { before: 0, after: 0 }])
  ),
  // One record per section whose category differs from the stored one.
  changes: []
};

/**
 * Per-category matching rules.
 *
 * Each category provides:
 * - keywords:      lowercase substrings looked up in the section title and excerpt
 * - titlePatterns: regular expressions tested against the lowercased title
 * - exclude:       (critical only) phrases that disqualify the category outright
 */
const RULES = {
  critical: {
    keywords: [
      'security',
      'warning',
      'caution',
      'danger',
      'breaking change',
      'must read first',
      'before you begin',
      'important notice',
      'critical prerequisite',
      'blockers',
      'requirements'
    ],
    titlePatterns: [
      /^(security|warning|caution|critical)/i,
      /breaking changes?/i,
      /requirements?$/i,
      /^before (you|starting)/i
    ],
    // Keep "critical" for genuine warnings; framing sections such as
    // "why this matters" must never land here.
    exclude: [
      'why this matters',
      'who should',
      'invitation',
      'bottom line',
      'key finding',
      'introduction'
    ]
  },

  conceptual: {
    keywords: [
      'understanding',
      'concept',
      'principle',
      'theory',
      'foundation',
      'why',
      'what is',
      'introduction',
      'overview',
      'core idea',
      'key finding',
      'philosophy',
      'paradigm',
      'mental model',
      'thinking',
      'perspective'
    ],
    titlePatterns: [
      /^(understanding|why|what is|introduction|overview)/i,
      /concept(s)?$/i,
      /principle(s)?$/i,
      /foundation(s)?$/i,
      /key (finding|idea)/i,
      /bottom line/i,
      /who should/i
    ]
  },

  practical: {
    keywords: [
      'guide',
      'example',
      'step',
      'how to',
      'tutorial',
      'walkthrough',
      'use case',
      'scenario',
      'getting started',
      'quick start',
      'implementation guide',
      'hands-on',
      'practical',
      'workflow'
    ],
    titlePatterns: [
      /^(how to|getting started|quick start|guide)/i,
      /step[- ]by[- ]step/i,
      /example(s)?$/i,
      /use case(s)?$/i,
      /walkthrough/i,
      /workflow/i
    ]
  },

  technical: {
    keywords: [
      'architecture',
      'implementation',
      'api',
      'code',
      'technical',
      'development',
      'engineering',
      'system',
      'design pattern',
      'algorithm',
      'data structure',
      'performance',
      'optimization'
    ],
    titlePatterns: [
      /^(architecture|technical|implementation|api|code)/i,
      /design$/i,
      /specification$/i,
      /^system/i,
      /performance/i,
      /optimization/i
    ]
  },

  reference: {
    keywords: [
      'reference',
      'appendix',
      'glossary',
      'contact',
      'resources',
      'further reading',
      'bibliography',
      'links',
      'related work',
      'acknowledgment',
      'citation'
    ],
    titlePatterns: [
      /^(reference|appendix|glossary|contact|resources)/i,
      /further reading/i,
      /related (work|resources)/i,
      /^(acknowledgment|citation)/i
    ]
  }
};

/**
 * Analyze a section and determine its best-fitting category.
 *
 * Every category in RULES receives a score:
 *   +50  when any title pattern matches the title
 *   +20  per keyword found in the title
 *   +5   per keyword found in the excerpt
 * "critical" is pinned to -100 (and skipped) when the title or excerpt
 * contains one of its exclude phrases. Position and document-title bonuses
 * are applied afterwards. The highest score wins; ties resolve in RULES
 * declaration order. If the best score is below 10 the section defaults to
 * 'conceptual'.
 *
 * @param {{title?: string, excerpt?: string}} section - Section to classify.
 * @param {string} [docTitle] - Title of the parent document; a missing title
 *   is treated as empty.
 * @param {number} sectionIndex - Zero-based position of the section.
 * @param {number} totalSections - Number of sections in the document.
 * @returns {string} One of the category names defined in RULES.
 */
function categorizeSection(section, docTitle, sectionIndex, totalSections) {
  const title = (section.title || '').toLowerCase();
  const excerpt = (section.excerpt || '').toLowerCase();
  const combined = `${title} ${excerpt}`;
  // Documents without a title must not abort the whole run.
  const docTitleLower = (docTitle || '').toLowerCase();

  const scores = {
    critical: 0,
    conceptual: 0,
    practical: 0,
    technical: 0,
    reference: 0
  };

  // Score each category based on rules
  for (const [category, rules] of Object.entries(RULES)) {
    // Exclusions are checked first: a hit disqualifies "critical" outright
    // and skips the rest of its scoring.
    if (rules.exclude) {
      const hasExclude = rules.exclude.some(pattern =>
        combined.includes(pattern.toLowerCase())
      );
      if (hasExclude && category === 'critical') {
        scores[category] = -100; // Strong penalty
        continue;
      }
    }

    // Title patterns (strong signal) - counted once no matter how many match
    if (rules.titlePatterns && rules.titlePatterns.some(pattern => pattern.test(title))) {
      scores[category] += 50;
    }

    // Keywords in title (medium signal)
    const titleKeywords = rules.keywords.filter(kw => title.includes(kw));
    scores[category] += titleKeywords.length * 20;

    // Keywords in excerpt (weak signal)
    const excerptKeywords = rules.keywords.filter(kw => excerpt.includes(kw));
    scores[category] += excerptKeywords.length * 5;
  }

  // Position-based adjustments
  if (sectionIndex === 0) {
    // First section usually conceptual or critical prerequisite
    if (title.includes('introduction') || title.includes('overview')) {
      scores.conceptual += 30;
    }
  } else if (sectionIndex === totalSections - 1) {
    // Last section often reference
    scores.reference += 10;
  }

  // Document context adjustments
  if (docTitleLower.includes('case study') || docTitleLower.includes('incident')) {
    // Case studies are usually conceptual, not critical
    scores.conceptual += 20;
    scores.critical -= 30;
  }
  if (docTitleLower.includes('implementation') || docTitleLower.includes('guide')) {
    scores.practical += 15;
  }
  if (docTitleLower.includes('api') || docTitleLower.includes('technical')) {
    scores.technical += 15;
  }

  // Highest score first; Array.prototype.sort is stable, so ties keep
  // the declaration order of `scores`.
  const [[bestCategory, bestScore]] = Object.entries(scores).sort((a, b) => b[1] - a[1]);

  // If all scores are very low, default to conceptual
  if (bestScore < 10) {
    return 'conceptual';
  }

  return bestCategory;
}

/**
 * Percentage of `part` in `total`, rounded to an integer.
 * Returns 0 when `total` is 0 so the summary never prints NaN.
 */
function percentOf(part, total) {
  return total > 0 ? Math.round(part / total * 100) : 0;
}

/**
 * Increment the `before` or `after` tally of a category, creating the tally
 * on first sight so categories outside the known set cannot crash the run.
 */
function bumpCategoryCount(category, phase) {
  if (!Object.prototype.hasOwnProperty.call(stats.byCategory, category)) {
    stats.byCategory[category] = { before: 0, after: 0 };
  }
  stats.byCategory[category][phase]++;
}

/**
 * Main function
 *
 * Connects to MongoDB, loads the public documents (optionally restricted to
 * the slug given via --doc), recomputes the category of every section, and
 * unless --dry-run is set writes the changed categories back with one
 * bulkWrite per document. Prints per-document changes and a final summary.
 * The database connection is always closed, even when processing fails.
 *
 * @returns {Promise<void>}
 */
async function main() {
  console.log('═══════════════════════════════════════════════════════════');
  console.log('  SECTION RECATEGORIZATION');
  console.log('═══════════════════════════════════════════════════════════\n');

  if (dryRun) {
    console.log('🔍 DRY RUN MODE - No changes will be saved\n');
  }

  if (specificDoc) {
    console.log(`📄 Processing single document: ${specificDoc}\n`);
  }

  // Connect to MongoDB
  console.log('📡 Connecting to MongoDB...');
  const client = await MongoClient.connect('mongodb://localhost:27017/tractatus_dev');

  try {
    const db = client.db();
    const collection = db.collection('documents');

    // Fetch documents
    const filter = { visibility: 'public' };
    if (specificDoc) {
      filter.slug = specificDoc;
    }

    const docs = await collection.find(filter).sort({ order: 1 }).toArray();
    console.log(`✓ Found ${docs.length} document(s)\n`);

    stats.totalDocuments = docs.length;

    // Process each document
    for (const doc of docs) {
      if (!doc.sections || doc.sections.length === 0) {
        console.log(`${doc.title}: No sections (skipping)\n`);
        continue;
      }

      console.log(`\n${'='.repeat(70)}`);
      console.log(`${doc.title}`);
      console.log(`${'='.repeat(70)}\n`);

      const updates = [];
      stats.totalSections += doc.sections.length;

      doc.sections.forEach((section, index) => {
        // Sections without a stored category are treated as 'conceptual'.
        const oldCategory = section.category || 'conceptual';
        // Untitled documents are classified as if the title were empty.
        const newCategory = categorizeSection(section, doc.title || '', index, doc.sections.length);

        bumpCategoryCount(oldCategory, 'before');
        bumpCategoryCount(newCategory, 'after');

        if (oldCategory !== newCategory) {
          stats.changed++;
          console.log(`[${index + 1}/${doc.sections.length}] ${section.title}`);
          console.log(`  ${oldCategory} → ${newCategory}`);

          updates.push({
            sectionIndex: index,
            oldCategory,
            newCategory,
            title: section.title
          });

          stats.changes.push({
            document: doc.title,
            section: section.title,
            from: oldCategory,
            to: newCategory
          });
        } else {
          stats.unchanged++;
        }
      });

      // Apply updates if not dry run
      if (!dryRun && updates.length > 0) {
        // Each changed section is addressed by its array index in the document.
        const updateOperations = updates.map(update => ({
          updateOne: {
            filter: { _id: doc._id },
            update: {
              $set: {
                [`sections.${update.sectionIndex}.category`]: update.newCategory
              }
            }
          }
        }));

        await collection.bulkWrite(updateOperations);
        console.log(`\n✓ Applied ${updates.length} changes to database`);
      } else if (updates.length > 0) {
        console.log(`\n🔍 Would apply ${updates.length} changes (dry-run)`);
      } else {
        console.log(`\n✓ No changes needed`);
      }
    }

    // Summary
    console.log('\n\n═══════════════════════════════════════════════════════════');
    console.log('  RECATEGORIZATION SUMMARY');
    console.log('═══════════════════════════════════════════════════════════\n');

    console.log(`Documents processed: ${stats.totalDocuments}`);
    console.log(`Total sections: ${stats.totalSections}`);
    console.log(`Changed: ${stats.changed} (${percentOf(stats.changed, stats.totalSections)}%)`);
    console.log(`Unchanged: ${stats.unchanged} (${percentOf(stats.unchanged, stats.totalSections)}%)\n`);

    console.log('Category changes:');
    for (const [category, counts] of Object.entries(stats.byCategory)) {
      const change = counts.after - counts.before;
      const changeStr = change > 0 ? `+${change}` : change.toString();
      const changePercent = counts.before > 0
        ? Math.round((change / counts.before) * 100)
        : 0;

      console.log(`  ${category}: ${counts.before} → ${counts.after} (${changeStr}, ${changePercent > 0 ? '+' : ''}${changePercent}%)`);
    }

    if (dryRun) {
      console.log('\n🔍 DRY RUN COMPLETE - No changes saved');
      console.log('   Run without --dry-run to apply changes\n');
    } else {
      console.log('\n✅ RECATEGORIZATION COMPLETE\n');
    }
  } finally {
    // Release the connection on success and on failure alike.
    await client.close();
  }
}

// Entry point: run main() and turn any rejection into a logged failure
// with a non-zero exit code.
(async () => {
  try {
    await main();
  } catch (err) {
    console.error('\n❌ Fatal error:', err.message);
    console.error(err.stack);
    process.exit(1);
  }
})();