tractatus/scripts/recategorize-sections.js
TheFlow 8c22811110 feat(docs): intelligent section recategorization + i18n infrastructure
This commit includes two major improvements to the documentation system:

## 1. Section Recategorization (UX Fix)

**Problem**: 64 sections (24%) were incorrectly marked as "critical" and
displayed at the bottom of documents, burying important foundational content.

**Solution**:
- Created intelligent recategorization script analyzing titles, excerpts,
  and document context
- Reduced "critical" from 64 → 2 sections (97% reduction)
- Properly categorized content by purpose:
  - Conceptual: 63 → 138 (+119%) - foundations, "why this matters"
  - Practical: 3 → 46 (+1433%) - how-to guides, examples
  - Technical: 111 → 50 (-55%) - true implementation details

**UI Improvements**:
- Reordered category display: Critical → Conceptual → Practical → Technical → Reference
- Changed Critical color from amber to red for better visual distinction
- All 22 documents recategorized (173 sections updated)

## 2. i18n Infrastructure (Phase 2)

**Backend**:
- DeepL API integration service with quota management and error handling
- Translation API routes (GET /api/documents/:slug?lang=de, POST /api/documents/:id/translate)
- Document model already supports translations field (no schema changes)

**Frontend**:
- docs-app.js enhanced with language detection and URL parameter support
- Automatic fallback to English when translation unavailable
- Integration with existing i18n-simple.js system

**Scripts**:
- translate-all-documents.js: Batch translation workflow (dry-run support)
- audit-section-categories.js: Category distribution analysis

**URL Strategy**: Query parameter approach (?lang=de, ?lang=fr)

**Status**: Backend complete, ready for DeepL API key configuration

**Files Modified**:
- Frontend: document-cards.js, docs-app.js
- Backend: documents.controller.js, documents.routes.js, DeepL.service.js
- Scripts: 3 new governance/i18n scripts

**Database**: 173 sections recategorized via script (already applied)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-10-26 00:48:27 +13:00

358 lines
11 KiB
JavaScript
Executable file

#!/usr/bin/env node
/**
* Smart Section Recategorization Script
*
* Analyzes section content and intelligently reassigns categories based on:
* - Section titles
* - Excerpts
* - Position in document
* - Content keywords
*
* Categories:
* - critical: Warnings, blockers, security issues, mandatory prerequisites
* - conceptual: Foundational ideas, "why this matters", core principles
* - practical: How-to guides, examples, step-by-step instructions
* - technical: Architecture, implementation details, code examples
* - reference: Appendices, glossaries, further reading, contact info
*
* Usage:
* node scripts/recategorize-sections.js --dry-run # Preview changes
* node scripts/recategorize-sections.js # Apply changes
* node scripts/recategorize-sections.js --doc=slug # Single document
*/
const { MongoClient } = require('mongodb');
// Command-line flags (everything after `node <script>`):
//   --dry-run     preview only, nothing is written
//   --doc=<slug>  restrict processing to one document
const args = process.argv.slice(2);
const dryRun = args.indexOf('--dry-run') !== -1;
const specificDoc = (() => {
  const docFlag = args.find((arg) => arg.startsWith('--doc='));
  // Value is the text between the first and second '=' of the flag.
  return docFlag === undefined ? undefined : docFlag.split('=')[1];
})();
// Run-wide counters, filled in while documents are processed and
// printed in the final summary.
const stats = {
  totalDocuments: 0,
  totalSections: 0,
  changed: 0,
  unchanged: 0,
  // One independent { before, after } counter pair per known category.
  byCategory: Object.fromEntries(
    ['critical', 'conceptual', 'practical', 'technical', 'reference'].map(
      (categoryName) => [categoryName, { before: 0, after: 0 }]
    )
  ),
  // One entry per section whose category differs from the stored one.
  changes: [],
};
/**
 * Per-category matching rules used to score a section.
 *
 * Each category may define:
 *   - keywords:      lower-case phrases searched for (as substrings) in the
 *                    section title and excerpt
 *   - titlePatterns: regular expressions tested against the section title
 *   - exclude:       phrases that rule the category out when they appear in
 *                    the title or excerpt
 *
 * The order of the categories matters: it is the order in which they are
 * scored.
 */
const RULES = {
  critical: {
    keywords: [
      'security',
      'warning',
      'caution',
      'danger',
      'breaking change',
      'must read first',
      'before you begin',
      'important notice',
      'critical prerequisite',
      'blockers',
      'requirements',
    ],
    titlePatterns: [
      /^(security|warning|caution|critical)/i,
      /breaking changes?/i,
      /requirements?$/i,
      /^before (you|starting)/i,
    ],
    // "critical" is reserved for genuine warnings; framing sections such as
    // "why this matters" must never land here.
    exclude: [
      'why this matters',
      'who should',
      'invitation',
      'bottom line',
      'key finding',
      'introduction',
    ],
  },

  conceptual: {
    keywords: [
      'understanding',
      'concept',
      'principle',
      'theory',
      'foundation',
      'why',
      'what is',
      'introduction',
      'overview',
      'core idea',
      'key finding',
      'philosophy',
      'paradigm',
      'mental model',
      'thinking',
      'perspective',
    ],
    titlePatterns: [
      /^(understanding|why|what is|introduction|overview)/i,
      /concept(s)?$/i,
      /principle(s)?$/i,
      /foundation(s)?$/i,
      /key (finding|idea)/i,
      /bottom line/i,
      /who should/i,
    ],
  },

  practical: {
    keywords: [
      'guide',
      'example',
      'step',
      'how to',
      'tutorial',
      'walkthrough',
      'use case',
      'scenario',
      'getting started',
      'quick start',
      'implementation guide',
      'hands-on',
      'practical',
      'workflow',
    ],
    titlePatterns: [
      /^(how to|getting started|quick start|guide)/i,
      /step[- ]by[- ]step/i,
      /example(s)?$/i,
      /use case(s)?$/i,
      /walkthrough/i,
      /workflow/i,
    ],
  },

  technical: {
    keywords: [
      'architecture',
      'implementation',
      'api',
      'code',
      'technical',
      'development',
      'engineering',
      'system',
      'design pattern',
      'algorithm',
      'data structure',
      'performance',
      'optimization',
    ],
    titlePatterns: [
      /^(architecture|technical|implementation|api|code)/i,
      /design$/i,
      /specification$/i,
      /^system/i,
      /performance/i,
      /optimization/i,
    ],
  },

  reference: {
    keywords: [
      'reference',
      'appendix',
      'glossary',
      'contact',
      'resources',
      'further reading',
      'bibliography',
      'links',
      'related work',
      'acknowledgment',
      'citation',
    ],
    titlePatterns: [
      /^(reference|appendix|glossary|contact|resources)/i,
      /further reading/i,
      /related (work|resources)/i,
      /^(acknowledgment|citation)/i,
    ],
  },
};
/**
 * Analyze a section and determine its best-fitting category.
 *
 * Scoring, per category listed in RULES:
 *   - title matches at least one titlePattern:  +50 (once)
 *   - each keyword contained in the title:      +20
 *   - each keyword contained in the excerpt:    +5
 *   - title or excerpt contains an `exclude` phrase: the score is set to
 *     -100 and the other signals for that category are skipped
 *
 * Afterwards position-based and document-title-based adjustments are applied.
 * The highest score wins; on a tie the category declared first in the score
 * table (critical, conceptual, practical, technical, reference) wins. When
 * the best score is below 10 the section defaults to 'conceptual'.
 *
 * Keyword checks are plain substring checks, so short keywords also match
 * inside longer words.
 *
 * @param {object} section - Section to classify; `title` and `excerpt` are
 *   read and treated as empty strings when missing.
 * @param {string} [docTitle] - Title of the owning document; a missing title
 *   is treated as an empty string.
 * @param {number} sectionIndex - Zero-based position of the section.
 * @param {number} totalSections - Number of sections in the document.
 * @returns {string} One of 'critical', 'conceptual', 'practical',
 *   'technical' or 'reference'.
 */
function categorizeSection(section, docTitle, sectionIndex, totalSections) {
  const title = (section.title || '').toLowerCase();
  const excerpt = (section.excerpt || '').toLowerCase();
  const combined = `${title} ${excerpt}`;
  // Documents without a title must not abort the whole run.
  const docTitleLower = (docTitle || '').toLowerCase();

  const scores = {
    critical: 0,
    conceptual: 0,
    practical: 0,
    technical: 0,
    reference: 0
  };

  // Score each category based on its rules
  for (const [category, rules] of Object.entries(RULES)) {
    // An exclude phrase rules the category out entirely.
    const isExcluded = (rules.exclude ?? []).some(phrase =>
      combined.includes(phrase.toLowerCase())
    );
    if (isExcluded) {
      scores[category] = -100; // Strong penalty
      continue;
    }

    // Title patterns (strong signal)
    if ((rules.titlePatterns ?? []).some(pattern => pattern.test(title))) {
      scores[category] += 50;
    }

    const keywords = rules.keywords ?? [];
    // Keywords in title (medium signal)
    scores[category] += keywords.filter(kw => title.includes(kw)).length * 20;
    // Keywords in excerpt (weak signal)
    scores[category] += keywords.filter(kw => excerpt.includes(kw)).length * 5;
  }

  // Position-based adjustments
  if (sectionIndex === 0) {
    // An opening introduction/overview is conceptual
    if (title.includes('introduction') || title.includes('overview')) {
      scores.conceptual += 30;
    }
  } else if (sectionIndex === totalSections - 1) {
    // Last section is often reference material
    scores.reference += 10;
  }

  // Document context adjustments
  if (docTitleLower.includes('case study') || docTitleLower.includes('incident')) {
    // Case studies are usually conceptual, not critical
    scores.conceptual += 20;
    scores.critical -= 30;
  }
  if (docTitleLower.includes('implementation') || docTitleLower.includes('guide')) {
    scores.practical += 15;
  }
  if (docTitleLower.includes('api') || docTitleLower.includes('technical')) {
    scores.technical += 15;
  }

  // Pick the highest score; the strict comparison keeps the first category
  // on ties.
  let bestCategory = 'conceptual';
  let bestScore = -Infinity;
  for (const [category, score] of Object.entries(scores)) {
    if (score > bestScore) {
      bestCategory = category;
      bestScore = score;
    }
  }

  // If all scores are very low, default to conceptual
  if (bestScore < 10) {
    return 'conceptual';
  }
  return bestCategory;
}
/**
 * Entry point: recategorize the sections of public documents.
 *
 * Connects to the local `tractatus_dev` MongoDB database, loads the public
 * documents (optionally only the one selected with --doc=<slug>), computes a
 * category for every section with categorizeSection(), logs each change and,
 * unless --dry-run was given, writes the new categories back. A summary of
 * the run is printed at the end.
 *
 * The database connection is closed even when processing fails part-way.
 *
 * @returns {Promise<void>} Resolves once the summary is printed and the
 *   connection is closed; rejects on connection or database errors.
 */
async function main() {
  console.log('═══════════════════════════════════════════════════════════');
  console.log(' SECTION RECATEGORIZATION');
  console.log('═══════════════════════════════════════════════════════════\n');
  if (dryRun) {
    console.log('🔍 DRY RUN MODE - No changes will be saved\n');
  }
  if (specificDoc) {
    console.log(`📄 Processing single document: ${specificDoc}\n`);
  }

  // Count a category occurrence. Stored data may contain a category that is
  // not one of the five known ones; create its counters on demand instead of
  // crashing in the middle of a run.
  const countCategory = (category, field) => {
    if (!Object.prototype.hasOwnProperty.call(stats.byCategory, category)) {
      stats.byCategory[category] = { before: 0, after: 0 };
    }
    stats.byCategory[category][field]++;
  };

  // Share of all processed sections, guarding against an empty run (0/0).
  const percentOfSections = count =>
    (stats.totalSections > 0 ? Math.round(count / stats.totalSections * 100) : 0);

  // Connect to MongoDB
  console.log('📡 Connecting to MongoDB...');
  const client = await MongoClient.connect('mongodb://localhost:27017/tractatus_dev');
  try {
    const db = client.db();
    const collection = db.collection('documents');

    // Fetch documents
    const filter = { visibility: 'public' };
    if (specificDoc) {
      filter.slug = specificDoc;
    }
    const docs = await collection.find(filter).sort({ order: 1 }).toArray();
    console.log(`✓ Found ${docs.length} document(s)\n`);
    stats.totalDocuments = docs.length;

    // Process each document
    for (const doc of docs) {
      if (!doc.sections || doc.sections.length === 0) {
        console.log(`${doc.title}: No sections (skipping)\n`);
        continue;
      }
      console.log(`\n${'='.repeat(70)}`);
      console.log(`${doc.title}`);
      console.log(`${'='.repeat(70)}\n`);

      const updates = [];
      stats.totalSections += doc.sections.length;

      doc.sections.forEach((section, index) => {
        // Sections without a stored category are treated as conceptual.
        const oldCategory = section.category || 'conceptual';
        const newCategory = categorizeSection(section, doc.title, index, doc.sections.length);
        countCategory(oldCategory, 'before');
        countCategory(newCategory, 'after');

        if (oldCategory === newCategory) {
          stats.unchanged++;
          return;
        }

        stats.changed++;
        console.log(`[${index + 1}/${doc.sections.length}] ${section.title}`);
        console.log(` ${oldCategory} → ${newCategory}`);
        updates.push({
          sectionIndex: index,
          oldCategory,
          newCategory,
          title: section.title
        });
        stats.changes.push({
          document: doc.title,
          section: section.title,
          from: oldCategory,
          to: newCategory
        });
      });

      // Apply updates if not dry run
      if (!dryRun && updates.length > 0) {
        // All changes of one document go into a single $set so the document
        // is updated atomically with one round trip.
        const categoryFields = {};
        for (const update of updates) {
          categoryFields[`sections.${update.sectionIndex}.category`] = update.newCategory;
        }
        await collection.updateOne({ _id: doc._id }, { $set: categoryFields });
        console.log(`\n✓ Applied ${updates.length} changes to database`);
      } else if (updates.length > 0) {
        console.log(`\n🔍 Would apply ${updates.length} changes (dry-run)`);
      } else {
        console.log(`\n✓ No changes needed`);
      }
    }

    // Summary
    console.log('\n\n═══════════════════════════════════════════════════════════');
    console.log(' RECATEGORIZATION SUMMARY');
    console.log('═══════════════════════════════════════════════════════════\n');
    console.log(`Documents processed: ${stats.totalDocuments}`);
    console.log(`Total sections: ${stats.totalSections}`);
    console.log(`Changed: ${stats.changed} (${percentOfSections(stats.changed)}%)`);
    console.log(`Unchanged: ${stats.unchanged} (${percentOfSections(stats.unchanged)}%)\n`);
    console.log('Category changes:');
    for (const [category, counts] of Object.entries(stats.byCategory)) {
      const change = counts.after - counts.before;
      const changeStr = change > 0 ? `+${change}` : change.toString();
      // Relative change is undefined for a category that started empty;
      // report 0% in that case.
      const changePercent = counts.before > 0
        ? Math.round((change / counts.before) * 100)
        : 0;
      console.log(` ${category}: ${counts.before} → ${counts.after} (${changeStr}, ${changePercent > 0 ? '+' : ''}${changePercent}%)`);
    }
    if (dryRun) {
      console.log('\n🔍 DRY RUN COMPLETE - No changes saved');
      console.log(' Run without --dry-run to apply changes\n');
    } else {
      console.log('\n✅ RECATEGORIZATION COMPLETE\n');
    }
  } finally {
    // Release the connection on success and on failure alike.
    await client.close();
  }
}
// Start the script. Any rejection from main() is reported with its message
// and stack trace, and the process exits with status 1.
main().catch((fatal) => {
  const { message, stack } = fatal;
  console.error('\n❌ Fatal error:', message);
  console.error(stack);
  process.exit(1);
});