From bf187ff4115bd5125e7c36c93064637b7c8cd60a Mon Sep 17 00:00:00 2001 From: TheFlow Date: Sat, 1 Nov 2025 13:05:22 +1300 Subject: [PATCH] fix: add excerpt, readingTime, sections for all glossary translations --- scripts/fix-glossary-structure.js | 118 ++++++++++++++++++++++++++++-- 1 file changed, 110 insertions(+), 8 deletions(-) diff --git a/scripts/fix-glossary-structure.js b/scripts/fix-glossary-structure.js index 68144bb4..43b3569b 100644 --- a/scripts/fix-glossary-structure.js +++ b/scripts/fix-glossary-structure.js @@ -33,8 +33,61 @@ function extractFrontmatter(content) { return { metadata, content: remainingContent }; } +// Generate excerpt from markdown content +function generateExcerpt(markdown) { + // Remove markdown formatting + let text = markdown + .replace(/#{1,6}\s+/g, '') // Remove headers + .replace(/\*\*(.+?)\*\*/g, '$1') // Remove bold + .replace(/\*(.+?)\*/g, '$1') // Remove italic + .replace(/\[(.+?)\]\(.+?\)/g, '$1') // Remove links + .replace(/`(.+?)`/g, '$1') // Remove code + .replace(/---/g, '') // Remove horizontal rules + .trim(); + + // Get first ~150 characters + if (text.length > 150) { + text = text.substring(0, 150).trim() + '...'; + } + + return text || 'Glossary term definition'; +} + +// Calculate reading time from word count +function calculateReadingTime(markdown) { + const words = markdown.split(/\s+/).length; + const minutes = Math.ceil(words / 200); // Average reading speed + return Math.max(1, minutes); // Minimum 1 minute +} + +// Normalize content - add line breaks where needed +function normalizeContent(content) { + // Replace "---" horizontal rules with newlines + content = content.replace(/(\S)\s*---\s*/g, '$1\n\n---\n\n'); + + // Add line breaks before h2 headings + content = content.replace(/([^\n])\s+(##\s+)/g, '$1\n\n$2'); + + // Add line breaks before h3 headings + content = content.replace(/([^\n])\s+(###\s+)/g, '$1\n\n$2'); + + // For German/French: Add line breaks after "**Version:**" style metadata that comes after h1 + content = content.replace(/\*\*([^*]+):\*\*\s*([^\s*]+)\s+\*\*/g, '**$1:** $2\n\n**'); + + // Add line breaks after closing ** before text starts + content = content.replace(/\*\*\s+([A-ZÄÖÜ])/g, '**\n\n$1'); + + // Fix h2 titles that have content on same line - keep only first sentence or up to 100 chars + content = content.replace(/^##\s+(.{100,}?)\.(\s+[A-ZÄÖÜ])/gm, '## $1.\n\n$2'); + + return content; +} + // Build sections from h2 headings function buildSections(content, htmlContent) { + // Normalize content first to ensure h2 headings are on their own lines + content = normalizeContent(content); + const sections = []; const lines = content.split('\n'); let currentSection = null; @@ -47,20 +100,52 @@ function buildSections(content, htmlContent) { // Save previous section if (currentSection) { const sectionMarkdown = currentContent.join('\n'); + const sectionHtml = markdownToHtml(sectionMarkdown); + currentSection.content_markdown = sectionMarkdown; - currentSection.content_html = markdownToHtml(sectionMarkdown); + currentSection.content_html = sectionHtml; + currentSection.excerpt = generateExcerpt(sectionMarkdown); + currentSection.readingTime = calculateReadingTime(sectionMarkdown); + sections.push(currentSection); } + // Extract title - limit to first sentence or first 10 words if too long + let fullTitle = h2Match[1].trim(); + let title = fullTitle; + let titleRemainder = ''; + + // If title is very long (>100 chars), take only first sentence or first 10 words + if (fullTitle.length > 100) { + // Try to get first sentence + const firstSentence = fullTitle.match(/^(.{1,100}?[.!?])\s/); + if (firstSentence) { + title = firstSentence[1]; + titleRemainder = fullTitle.substring(firstSentence[1].length).trim(); + } else { + // Fall back to first 10 words + const words = fullTitle.split(/\s+/); + title = words.slice(0, 10).join(' '); + titleRemainder = words.slice(10).join(' '); + } + } + // Start new section currentSection = { - title: h2Match[1], - slug: h2Match[1].toLowerCase() + title: title, + slug: title.toLowerCase() .replace(/[^\w\s-]/g, '') .replace(/\s+/g, '-'), - order: sections.length + 1 + number: sections.length + 1, + category: 'term', + technicalLevel: 'basic' }; currentContent = []; + + // Add remainder to content if exists + if (titleRemainder) { + currentContent.push(titleRemainder); + } } else if (currentSection) { currentContent.push(line); } @@ -69,8 +154,13 @@ function buildSections(content, htmlContent) { // Save last section if (currentSection) { const sectionMarkdown = currentContent.join('\n'); + const sectionHtml = markdownToHtml(sectionMarkdown); + currentSection.content_markdown = sectionMarkdown; - currentSection.content_html = markdownToHtml(sectionMarkdown); + currentSection.content_html = sectionHtml; + currentSection.excerpt = generateExcerpt(sectionMarkdown); + currentSection.readingTime = calculateReadingTime(sectionMarkdown); + sections.push(currentSection); } @@ -108,13 +198,22 @@ async function run() { const sections = buildSections(en.content, markdownToHtml(en.content)); console.log(`✓ Built ${sections.length} sections for English\n`); - // Build translation objects + // Build German sections + const sectionsDe = buildSections(de.content, markdownToHtml(de.content)); + console.log(`✓ Built ${sectionsDe.length} sections for German\n`); + + // Build French sections + const sectionsFr = buildSections(fr.content, markdownToHtml(fr.content)); + console.log(`✓ Built ${sectionsFr.length} sections for French\n`); + + // Build translation objects with sections const translations = { de: { title: de.metadata.title, content_markdown: de.content, content_html: markdownToHtml(de.content), toc: extractTOC(de.content), + sections: sectionsDe, metadata: { translated_by: 'deepl', translated_at: new Date(), @@ -127,6 +226,7 @@ async function run() { content_markdown: fr.content, content_html: markdownToHtml(fr.content), toc: extractTOC(fr.content), + sections: sectionsFr, metadata: { translated_by: 'deepl', translated_at: new Date(), @@ -136,7 +236,7 @@ async function run() { } }; - console.log('✓ Built translation objects\n'); + console.log('✓ Built translation objects with sections\n'); // Find main glossary document (could be 'glossary', 'GLOSSARY', or long slug) const existingDoc = await collection.findOne({ @@ -193,7 +293,9 @@ async function run() { console.log('═══════════════════════════════════════════════════════════'); console.log(' SUMMARY'); console.log('═══════════════════════════════════════════════════════════\n'); - console.log(`Sections created: ${sections.length}`); + console.log(`English sections: ${sections.length} (with excerpt, readingTime)`); + console.log(`German sections: ${sectionsDe.length} (with excerpt, readingTime)`); + console.log(`French sections: ${sectionsFr.length} (with excerpt, readingTime)`); console.log(`Translations embedded: 2 (de, fr)`); console.log(`Separate docs deleted: ${deleteResult.deletedCount}`); console.log('\n✅ Glossary structure fixed!\n');