/**
 * Fix Glossary Structure
 * - Add embedded translations (not separate documents)
 * - Add sections for card view
 * - Clean up separate glossary-de/glossary-fr documents
 */

require('dotenv').config();
const { MongoClient } = require('mongodb');
const fs = require('fs').promises;
const path = require('path');
const { markdownToHtml, extractTOC } = require('../src/utils/markdown.util');

/**
 * Split a markdown file into its frontmatter metadata and body.
 *
 * The frontmatter is a leading block delimited by `---` lines. Each
 * `key: value` line becomes an entry in the metadata object; one leading and
 * one trailing quote character are stripped from the value. Lines without a
 * colon are ignored. Both LF and CRLF line endings are accepted.
 *
 * @param {string} content - Raw file content.
 * @returns {{metadata: Object<string, string>, content: string}} Parsed
 *   metadata and the text after the closing delimiter. When no frontmatter is
 *   found, metadata is empty and content is returned untouched.
 */
function extractFrontmatter(content) {
  const frontMatterRegex = /^---\r?\n([\s\S]*?)\r?\n---\r?\n([\s\S]*)$/;
  const match = content.match(frontMatterRegex);
  if (!match) {
    return { metadata: {}, content };
  }

  const [, frontMatterText, remainingContent] = match;
  const metadata = {};

  for (const line of frontMatterText.split(/\r?\n/)) {
    const [key, ...valueParts] = line.split(':');
    if (key && valueParts.length > 0) {
      // Re-join so values that themselves contain ':' (URLs, times) survive.
      const value = valueParts.join(':').trim();
      metadata[key.trim()] = value.replace(/^["']|["']$/g, '');
    }
  }

  return { metadata, content: remainingContent };
}

/**
 * Generate a short plain-text excerpt from markdown content.
 *
 * @param {string} markdown - Markdown source.
 * @returns {string} The text with common markdown syntax removed, truncated
 *   to 150 characters plus '...' when longer, or a fixed fallback string when
 *   nothing is left.
 */
function generateExcerpt(markdown) {
  let text = markdown
    .replace(/#{1,6}\s+/g, '') // Remove headers
    .replace(/\*\*(.+?)\*\*/g, '$1') // Remove bold
    .replace(/\*(.+?)\*/g, '$1') // Remove italic
    .replace(/\[(.+?)\]\(.+?\)/g, '$1') // Remove links
    .replace(/`(.+?)`/g, '$1') // Remove code
    .replace(/---/g, '') // Remove horizontal rules
    .trim();

  // Keep roughly the first 150 characters
  if (text.length > 150) {
    text = `${text.substring(0, 150).trim()}...`;
  }

  return text || 'Glossary term definition';
}

/**
 * Estimate reading time from the word count, at 200 words per minute.
 *
 * @param {string} markdown - Markdown source.
 * @returns {number} Whole minutes, never less than 1.
 */
function calculateReadingTime(markdown) {
  // Trim and drop empty tokens so surrounding whitespace (or an empty
  // section) does not count as a word.
  const words = markdown.trim().split(/\s+/).filter(Boolean).length;
  const minutes = Math.ceil(words / 200);
  return Math.max(1, minutes);
}

/**
 * Re-insert line breaks into markdown whose structure has been flattened
 * onto long lines, so that headings and rules sit on their own lines.
 *
 * The replacements are order-dependent and purely textual.
 * NOTE(review): the `---` rule also matches table separator rows and any
 * other run of three dashes that follows non-whitespace; the capital-letter
 * rules only know A-Z and ÄÖÜ (no French accented capitals) — confirm this
 * is acceptable for the glossary sources.
 *
 * @param {string} content - Markdown source.
 * @returns {string} Markdown with additional blank lines inserted.
 */
function normalizeContent(content) {
  let normalized = content;

  // Put every "---" that follows non-whitespace on its own paragraph
  normalized = normalized.replace(/(\S)\s*---\s*/g, '$1\n\n---\n\n');

  // Add line breaks before h2 headings
  normalized = normalized.replace(/([^\n])\s+(##\s+)/g, '$1\n\n$2');

  // Add line breaks before h3 headings
  normalized = normalized.replace(/([^\n])\s+(###\s+)/g, '$1\n\n$2');

  // For German/French: break after "**Version:** value" style metadata that
  // is immediately followed by another bold run
  normalized = normalized.replace(/\*\*([^*]+):\*\*\s*([^\s*]+)\s+\*\*/g, '**$1:** $2\n\n**');

  // Break after "**" when whitespace and then a capital letter follow
  normalized = normalized.replace(/\*\*\s+([A-ZÄÖÜ])/g, '**\n\n$1');

  // h2 lines with body text on the same line: when at least 100 characters
  // precede a sentence end, split the line after that first sentence
  normalized = normalized.replace(/^##\s+(.{100,}?)\.(\s+[A-ZÄÖÜ])/gm, '## $1.\n\n$2');

  return normalized;
}

/**
 * Turn a section title into a URL slug: lowercase, punctuation removed,
 * whitespace runs replaced by '-'.
 *
 * Letters and digits of any script are kept, so German/French titles keep
 * their umlauts and accents instead of silently losing characters. Slugs for
 * pure-ASCII titles are the same as with an ASCII-only `\w` filter.
 *
 * @param {string} title - Section title.
 * @returns {string} Slug.
 */
function slugifyTitle(title) {
  return title
    .toLowerCase()
    .replace(/[^\p{L}\p{N}_\s-]/gu, '')
    .replace(/\s+/g, '-');
}

/**
 * Fill in the content-derived fields of a section and return it.
 *
 * @param {Object} section - Section stub (title, slug, number, ...).
 * @param {string[]} contentLines - Markdown lines collected for the section.
 * @returns {Object} The same section object, completed.
 */
function finalizeSection(section, contentLines) {
  const sectionMarkdown = contentLines.join('\n');
  section.content_markdown = sectionMarkdown;
  section.content_html = markdownToHtml(sectionMarkdown);
  section.excerpt = generateExcerpt(sectionMarkdown);
  section.readingTime = calculateReadingTime(sectionMarkdown);
  return section;
}

/**
 * Build card-view sections from the h2 headings of a markdown document.
 *
 * The content is normalized first, then split line by line: every `## `
 * heading starts a new section and the following lines (until the next h2)
 * become its body. Anything before the first h2 is discarded.
 *
 * @param {string} content - Markdown source.
 * @param {string} [htmlContent] - Unused; kept so existing callers that pass
 *   the rendered document keep working.
 * @returns {Object[]} Sections with title, slug, number, category,
 *   technicalLevel, content_markdown, content_html, excerpt and readingTime.
 */
function buildSections(content, htmlContent) {
  // Normalize first to ensure h2 headings are on their own lines
  const lines = normalizeContent(content).split('\n');

  const sections = [];
  let currentSection = null;
  let currentContent = [];

  for (const line of lines) {
    const h2Match = line.match(/^## (.+)$/);

    if (!h2Match) {
      if (currentSection) {
        currentContent.push(line);
      }
      continue;
    }

    // A new heading closes the section that was being collected
    if (currentSection) {
      sections.push(finalizeSection(currentSection, currentContent));
    }

    const fullTitle = h2Match[1].trim();
    let title = fullTitle;
    let titleRemainder = '';

    // Very long "titles" are really title + body on one line: keep the first
    // sentence (or the first 10 words) as the title, the rest becomes body.
    if (fullTitle.length > 100) {
      const firstSentence = fullTitle.match(/^(.{1,100}?[.!?])\s/);
      if (firstSentence) {
        title = firstSentence[1];
        titleRemainder = fullTitle.substring(firstSentence[1].length).trim();
      } else {
        const words = fullTitle.split(/\s+/);
        title = words.slice(0, 10).join(' ');
        titleRemainder = words.slice(10).join(' ');
      }
    }

    currentSection = {
      title,
      slug: slugifyTitle(title),
      number: sections.length + 1,
      category: 'term',
      technicalLevel: 'basic'
    };
    currentContent = titleRemainder ? [titleRemainder] : [];
  }

  // Close the last section
  if (currentSection) {
    sections.push(finalizeSection(currentSection, currentContent));
  }

  return sections;
}

/**
 * Build the embedded translation object stored under `translations.<lang>`.
 *
 * @param {{metadata: Object, content: string}} parsed - Parsed markdown file.
 * @param {string} html - Rendered HTML of `parsed.content`.
 * @param {Object[]} sections - Sections built from `parsed.content`.
 * @returns {Object} Translation sub-document.
 */
function buildTranslation(parsed, html, sections) {
  return {
    title: parsed.metadata.title,
    content_markdown: parsed.content,
    content_html: html,
    toc: extractTOC(parsed.content),
    sections,
    metadata: {
      translated_by: 'deepl',
      translated_at: new Date(),
      reviewed: false,
      source_version: '1.1'
    }
  };
}

/**
 * Rebuild the main glossary document from the markdown sources: embed the
 * German and French translations, attach per-language sections, and delete
 * the separate translation / duplicate documents.
 *
 * Connection settings come from MONGODB_URI and MONGODB_DB, with local
 * development defaults. On failure the error is logged and the process exit
 * code is set to 1; the MongoDB client is closed in every case.
 *
 * @returns {Promise<void>}
 */
async function run() {
  const mongoUri = process.env.MONGODB_URI || 'mongodb://localhost:27017';
  const dbName = process.env.MONGODB_DB || 'tractatus_dev';
  const client = new MongoClient(mongoUri);

  try {
    await client.connect();
    const db = client.db(dbName);
    const collection = db.collection('documents');

    console.log('═══════════════════════════════════════════════════════════');
    console.log(' FIXING GLOSSARY STRUCTURE');
    console.log('═══════════════════════════════════════════════════════════\n');

    // Read markdown files (independent reads, so run them concurrently)
    const baseDir = path.join(__dirname, '../docs/markdown');
    const [glossaryEN, glossaryDE, glossaryFR] = await Promise.all([
      fs.readFile(path.join(baseDir, 'GLOSSARY.md'), 'utf8'),
      fs.readFile(path.join(baseDir, 'GLOSSARY-DE.md'), 'utf8'),
      fs.readFile(path.join(baseDir, 'GLOSSARY-FR.md'), 'utf8')
    ]);

    // Parse each file
    const en = extractFrontmatter(glossaryEN);
    const de = extractFrontmatter(glossaryDE);
    const fr = extractFrontmatter(glossaryFR);
    console.log('✓ Parsed markdown files\n');

    // Render each document once and reuse the HTML below.
    // NOTE(review): assumes markdownToHtml is a pure function of its input.
    const htmlEn = markdownToHtml(en.content);
    const sections = buildSections(en.content, htmlEn);
    console.log(`✓ Built ${sections.length} sections for English\n`);

    const htmlDe = markdownToHtml(de.content);
    const sectionsDe = buildSections(de.content, htmlDe);
    console.log(`✓ Built ${sectionsDe.length} sections for German\n`);

    const htmlFr = markdownToHtml(fr.content);
    const sectionsFr = buildSections(fr.content, htmlFr);
    console.log(`✓ Built ${sectionsFr.length} sections for French\n`);

    // Build translation objects with sections
    const translations = {
      de: buildTranslation(de, htmlDe, sectionsDe),
      fr: buildTranslation(fr, htmlFr, sectionsFr)
    };
    console.log('✓ Built translation objects with sections\n');

    // Find main glossary document (could be 'glossary', 'GLOSSARY', or long slug)
    const existingDoc = await collection.findOne({
      $or: [
        { slug: 'glossary' },
        { slug: 'GLOSSARY' },
        { slug: /^tractatus-agentic-governance-system-glossary-of-terms$/ }
      ]
    });

    if (!existingDoc) {
      console.error('✗ Could not find main glossary document');
      // Set the exit code instead of calling process.exit() so the finally
      // block still runs and closes the client.
      process.exitCode = 1;
      return;
    }
    console.log(`✓ Found glossary document: ${existingDoc.slug}\n`);

    // Update main glossary document
    const result = await collection.updateOne(
      { _id: existingDoc._id },
      {
        $set: {
          slug: 'glossary', // Normalize to lowercase
          category: 'getting-started', // Move to Getting Started section
          sections: sections,
          translations: translations,
          content_html: htmlEn,
          content_markdown: en.content,
          toc: extractTOC(en.content),
          updated_at: new Date()
        }
      }
    );
    console.log(`✓ Updated glossary document (${result.modifiedCount} modified)\n`);

    // Delete separate translation documents and old duplicates
    const deleteResult = await collection.deleteMany({
      $and: [
        { _id: { $ne: existingDoc._id } }, // Don't delete the main one
        {
          $or: [
            { slug: 'glossary-de' },
            { slug: 'glossary-fr' },
            { slug: 'GLOSSARY' },
            // Prefix match on purpose: also catches suffixed variants
            { slug: /^tractatus-agentic-governance-system-glossary-of-terms/ }
          ]
        }
      ]
    });
    console.log(`✓ Deleted ${deleteResult.deletedCount} duplicate/separate translation documents\n`);

    console.log('═══════════════════════════════════════════════════════════');
    console.log(' SUMMARY');
    console.log('═══════════════════════════════════════════════════════════\n');
    console.log(`English sections: ${sections.length} (with excerpt, readingTime)`);
    console.log(`German sections: ${sectionsDe.length} (with excerpt, readingTime)`);
    console.log(`French sections: ${sectionsFr.length} (with excerpt, readingTime)`);
    console.log(`Translations embedded: 2 (de, fr)`);
    console.log(`Separate docs deleted: ${deleteResult.deletedCount}`);
    console.log('\n✅ Glossary structure fixed!\n');
  } catch (error) {
    console.error('Error:', error);
    process.exitCode = 1;
  } finally {
    await client.close();
  }
}

// run() handles its own errors; this only catches a failure while closing
// the client, which would otherwise be an unhandled rejection.
run().catch((error) => {
  console.error('Error:', error);
  process.exitCode = 1;
});