#!/usr/bin/env node
/**
 * Smart Section Recategorization Script
 *
 * Analyzes section content and intelligently reassigns categories based on:
 * - Section titles
 * - Excerpts
 * - Position in document
 * - Content keywords
 *
 * Categories:
 * - critical:   Warnings, blockers, security issues, mandatory prerequisites
 * - conceptual: Foundational ideas, "why this matters", core principles
 * - practical:  How-to guides, examples, step-by-step instructions
 * - technical:  Architecture, implementation details, code examples
 * - reference:  Appendices, glossaries, further reading, contact info
 *
 * Usage:
 *   node scripts/recategorize-sections.js --dry-run       # Preview changes
 *   node scripts/recategorize-sections.js                 # Apply changes
 *   node scripts/recategorize-sections.js --doc=slug      # Single document
 *   node scripts/recategorize-sections.js --uri=mongodb://host/db  # Other database
 */

'use strict';

const { MongoClient } = require('mongodb');

/** Connection string used when no --uri= option is given. */
const DEFAULT_MONGO_URI = 'mongodb://localhost:27017/tractatus_dev';

/** Category assumed for sections that have none stored, and for low-signal sections. */
const DEFAULT_CATEGORY = 'conceptual';

/** Separator line used by the banner and the summary. */
const BANNER_RULE = '═══════════════════════════════════════════════════════════';

// Parse arguments
const args = process.argv.slice(2);

/**
 * Read the value of a `--name=value` command-line option.
 *
 * Everything after the first `=` is returned, so values that themselves
 * contain `=` (for example connection-string query parameters) stay intact.
 *
 * @param {string} name - Option name without the leading dashes.
 * @returns {string|undefined} The option value, or undefined when absent.
 */
function getOption(name) {
  const prefix = `--${name}=`;
  const match = args.find(arg => arg.startsWith(prefix));
  return match === undefined ? undefined : match.slice(prefix.length);
}

const dryRun = args.includes('--dry-run');
const specificDoc = getOption('doc');
const mongoUri = getOption('uri') || DEFAULT_MONGO_URI;

// Statistics
const stats = {
  totalDocuments: 0,
  totalSections: 0,
  changed: 0,
  unchanged: 0,
  byCategory: {
    critical: { before: 0, after: 0 },
    conceptual: { before: 0, after: 0 },
    practical: { before: 0, after: 0 },
    technical: { before: 0, after: 0 },
    reference: { before: 0, after: 0 }
  },
  changes: []
};

/**
 * Categorization rules based on content analysis.
 *
 * Per category:
 * - keywords:      lower-case substrings looked for in the title and excerpt
 * - titlePatterns: regular expressions tested against the lower-cased title
 * - exclude:       optional substrings that disqualify the category outright
 */
const RULES = {
  critical: {
    keywords: [
      'security', 'warning', 'caution', 'danger', 'breaking change',
      'must read first', 'before you begin', 'important notice',
      'critical prerequisite', 'blockers', 'requirements'
    ],
    titlePatterns: [
      /^(security|warning|caution|critical)/i,
      /breaking changes?/i,
      /requirements?$/i,
      /^before (you|starting)/i
    ],
    // Reserve critical for ACTUAL warnings, not "why this matters"
    exclude: [
      'why this matters', 'who should', 'invitation', 'bottom line',
      'key finding', 'introduction'
    ]
  },
  conceptual: {
    keywords: [
      'understanding', 'concept', 'principle', 'theory', 'foundation',
      'why', 'what is', 'introduction', 'overview', 'core idea',
      'key finding', 'philosophy', 'paradigm', 'mental model',
      'thinking', 'perspective'
    ],
    titlePatterns: [
      /^(understanding|why|what is|introduction|overview)/i,
      /concept(s)?$/i,
      /principle(s)?$/i,
      /foundation(s)?$/i,
      /key (finding|idea)/i,
      /bottom line/i,
      /who should/i
    ]
  },
  practical: {
    keywords: [
      'guide', 'example', 'step', 'how to', 'tutorial', 'walkthrough',
      'use case', 'scenario', 'getting started', 'quick start',
      'implementation guide', 'hands-on', 'practical', 'workflow'
    ],
    titlePatterns: [
      /^(how to|getting started|quick start|guide)/i,
      /step[- ]by[- ]step/i,
      /example(s)?$/i,
      /use case(s)?$/i,
      /walkthrough/i,
      /workflow/i
    ]
  },
  technical: {
    keywords: [
      'architecture', 'implementation', 'api', 'code', 'technical',
      'development', 'engineering', 'system', 'design pattern',
      'algorithm', 'data structure', 'performance', 'optimization'
    ],
    titlePatterns: [
      /^(architecture|technical|implementation|api|code)/i,
      /design$/i,
      /specification$/i,
      /^system/i,
      /performance/i,
      /optimization/i
    ]
  },
  reference: {
    keywords: [
      'reference', 'appendix', 'glossary', 'contact', 'resources',
      'further reading', 'bibliography', 'links', 'related work',
      'acknowledgment', 'citation'
    ],
    titlePatterns: [
      /^(reference|appendix|glossary|contact|resources)/i,
      /further reading/i,
      /related (work|resources)/i,
      /^(acknowledgment|citation)/i
    ]
  }
};

/**
 * Lower-case a value if it is a string; anything else becomes ''.
 *
 * @param {*} value - Candidate text, possibly missing or of another type.
 * @returns {string} Lower-cased text, or the empty string.
 */
function toLowerText(value) {
  return typeof value === 'string' ? value.toLowerCase() : '';
}

/**
 * Analyze section and determine best category.
 *
 * Scoring per category: +50 if any title pattern matches, +20 per keyword
 * found in the title, +5 per keyword found in the excerpt. A category whose
 * exclude list matches the title or excerpt is pinned at -100 and skipped.
 * Position and document-title adjustments are applied afterwards. When the
 * best score is below 10 the section falls back to 'conceptual'; ties go to
 * the category listed first in RULES.
 *
 * NOTE(review): keywords are matched as plain substrings, so short ones such
 * as 'api', 'step' or 'why' also match inside longer words. Left unchanged to
 * keep categorization results stable.
 *
 * @param {{title?: string, excerpt?: string}} section - Section to classify.
 * @param {string} docTitle - Title of the containing document (may be empty).
 * @param {number} sectionIndex - Zero-based position of the section.
 * @param {number} totalSections - Number of sections in the document.
 * @returns {string} One of the category names defined in RULES.
 */
function categorizeSection(section, docTitle, sectionIndex, totalSections) {
  const title = toLowerText(section.title);
  const excerpt = toLowerText(section.excerpt);
  const combined = `${title} ${excerpt}`;

  const scores = {
    critical: 0,
    conceptual: 0,
    practical: 0,
    technical: 0,
    reference: 0
  };

  // Score each category based on rules
  for (const [category, rules] of Object.entries(RULES)) {
    // Exclusions win over every positive signal for that category
    const isExcluded = Array.isArray(rules.exclude) &&
      rules.exclude.some(phrase => combined.includes(phrase.toLowerCase()));
    if (isExcluded) {
      scores[category] = -100; // Strong penalty
      continue;
    }

    // Title patterns (strong signal)
    if (rules.titlePatterns && rules.titlePatterns.some(pattern => pattern.test(title))) {
      scores[category] += 50;
    }

    // Keywords in title (medium signal) and in excerpt (weak signal)
    for (const keyword of rules.keywords) {
      if (title.includes(keyword)) {
        scores[category] += 20;
      }
      if (excerpt.includes(keyword)) {
        scores[category] += 5;
      }
    }
  }

  // Position-based adjustments
  if (sectionIndex === 0) {
    // First section usually conceptual or critical prerequisite
    if (title.includes('introduction') || title.includes('overview')) {
      scores.conceptual += 30;
    }
  } else if (sectionIndex === totalSections - 1) {
    // Last section often reference
    scores.reference += 10;
  }

  // Document context adjustments
  const docTitleLower = toLowerText(docTitle);
  if (docTitleLower.includes('case study') || docTitleLower.includes('incident')) {
    // Case studies are usually conceptual, not critical
    scores.conceptual += 20;
    scores.critical -= 30;
  }
  if (docTitleLower.includes('implementation') || docTitleLower.includes('guide')) {
    scores.practical += 15;
  }
  if (docTitleLower.includes('api') || docTitleLower.includes('technical')) {
    scores.technical += 15;
  }

  // Find category with highest score (stable sort keeps RULES order on ties)
  const [bestCategory, bestScore] = Object.entries(scores)
    .sort((a, b) => b[1] - a[1])[0];

  // If all scores are very low, default to conceptual
  return bestScore < 10 ? DEFAULT_CATEGORY : bestCategory;
}

/**
 * Express `part` as a whole-number percentage of `whole`.
 *
 * @param {number} part - Numerator (may be negative).
 * @param {number} whole - Denominator.
 * @returns {number} Rounded percentage, or 0 when `whole` is not positive.
 */
function percentOf(part, whole) {
  return whole > 0 ? Math.round(part / whole * 100) : 0;
}

/**
 * Increment a before/after counter, registering categories that are not
 * among the predefined ones (e.g. legacy values already stored on sections).
 *
 * @param {string} category - Category name.
 * @param {'before'|'after'} field - Which counter to increment.
 */
function bumpCategoryCount(category, field) {
  if (!Object.prototype.hasOwnProperty.call(stats.byCategory, category)) {
    stats.byCategory[category] = { before: 0, after: 0 };
  }
  stats.byCategory[category][field] += 1;
}

/**
 * Recategorize every section of one document, logging each change and
 * updating the global statistics.
 *
 * @param {object} doc - Document with a non-empty `sections` array.
 * @returns {Array<{sectionIndex: number, oldCategory: string, newCategory: string, title: string}>}
 *   The changes that need to be written back.
 */
function planSectionUpdates(doc) {
  const updates = [];
  const total = doc.sections.length;

  for (const [index, section] of doc.sections.entries()) {
    const oldCategory = section.category || DEFAULT_CATEGORY;
    const newCategory = categorizeSection(section, doc.title, index, total);

    bumpCategoryCount(oldCategory, 'before');
    bumpCategoryCount(newCategory, 'after');

    if (oldCategory === newCategory) {
      stats.unchanged++;
      continue;
    }

    stats.changed++;
    console.log(`[${index + 1}/${total}] ${section.title}`);
    console.log(` ${oldCategory} → ${newCategory}`);

    updates.push({
      sectionIndex: index,
      oldCategory,
      newCategory,
      title: section.title
    });
    stats.changes.push({
      document: doc.title,
      section: section.title,
      from: oldCategory,
      to: newCategory
    });
  }

  return updates;
}

/**
 * Print the end-of-run summary from the global statistics.
 */
function printSummary() {
  console.log(`\n\n${BANNER_RULE}`);
  console.log(' RECATEGORIZATION SUMMARY');
  console.log(`${BANNER_RULE}\n`);

  console.log(`Documents processed: ${stats.totalDocuments}`);
  console.log(`Total sections: ${stats.totalSections}`);
  console.log(`Changed: ${stats.changed} (${percentOf(stats.changed, stats.totalSections)}%)`);
  console.log(`Unchanged: ${stats.unchanged} (${percentOf(stats.unchanged, stats.totalSections)}%)\n`);

  console.log('Category changes:');
  for (const [category, counts] of Object.entries(stats.byCategory)) {
    const change = counts.after - counts.before;
    const changeStr = change > 0 ? `+${change}` : change.toString();
    const changePercent = percentOf(change, counts.before);
    console.log(` ${category}: ${counts.before} → ${counts.after} (${changeStr}, ${changePercent > 0 ? '+' : ''}${changePercent}%)`);
  }

  if (dryRun) {
    console.log('\n🔍 DRY RUN COMPLETE - No changes saved');
    console.log(' Run without --dry-run to apply changes\n');
  } else {
    console.log('\n✅ RECATEGORIZATION COMPLETE\n');
  }
}

/**
 * Main function: connect, recategorize every matching public document,
 * persist the changes (unless --dry-run) and print a summary.
 *
 * @returns {Promise<void>}
 */
async function main() {
  console.log(BANNER_RULE);
  console.log(' SECTION RECATEGORIZATION');
  console.log(`${BANNER_RULE}\n`);

  if (dryRun) {
    console.log('🔍 DRY RUN MODE - No changes will be saved\n');
  }

  if (specificDoc) {
    console.log(`📄 Processing single document: ${specificDoc}\n`);
  }

  // Connect to MongoDB
  console.log('📡 Connecting to MongoDB...');
  const client = await MongoClient.connect(mongoUri);

  // Everything after a successful connect runs inside try/finally so the
  // connection is released even when a query or update fails.
  try {
    const collection = client.db().collection('documents');

    // Fetch documents
    const filter = { visibility: 'public' };
    if (specificDoc) {
      filter.slug = specificDoc;
    }

    const docs = await collection.find(filter).sort({ order: 1 }).toArray();
    console.log(`✓ Found ${docs.length} document(s)\n`);
    stats.totalDocuments = docs.length;

    // Process each document
    for (const doc of docs) {
      if (!Array.isArray(doc.sections) || doc.sections.length === 0) {
        console.log(`${doc.title}: No sections (skipping)\n`);
        continue;
      }

      console.log(`\n${'='.repeat(70)}`);
      console.log(`${doc.title}`);
      console.log(`${'='.repeat(70)}\n`);

      stats.totalSections += doc.sections.length;
      const updates = planSectionUpdates(doc);

      if (updates.length === 0) {
        console.log(`\n✓ No changes needed`);
      } else if (dryRun) {
        console.log(`\n🔍 Would apply ${updates.length} changes (dry-run)`);
      } else {
        // One $set carrying every changed section keeps the write to a
        // single atomic update per document.
        const setFields = {};
        for (const update of updates) {
          setFields[`sections.${update.sectionIndex}.category`] = update.newCategory;
        }
        await collection.updateOne({ _id: doc._id }, { $set: setFields });
        console.log(`\n✓ Applied ${updates.length} changes to database`);
      }
    }

    printSummary();
  } finally {
    await client.close();
  }
}

// Run
main().catch(err => {
  console.error('\n❌ Fatal error:', err.message);
  console.error(err.stack);
  process.exit(1);
});