From bf187ff4115bd5125e7c36c93064637b7c8cd60a Mon Sep 17 00:00:00 2001
From: TheFlow <theflow@sydigital.com>
Date: Sat, 1 Nov 2025 13:05:22 +1300
Subject: [PATCH] fix: add excerpt, readingTime, sections for all glossary
 translations

---
 scripts/fix-glossary-structure.js | 118 ++++++++++++++++++++++++++++--
 1 file changed, 110 insertions(+), 8 deletions(-)

diff --git a/scripts/fix-glossary-structure.js b/scripts/fix-glossary-structure.js
index 68144bb4..43b3569b 100644
--- a/scripts/fix-glossary-structure.js
+++ b/scripts/fix-glossary-structure.js
@@ -33,8 +33,61 @@ function extractFrontmatter(content) {
   return { metadata, content: remainingContent };
 }
 
+// Build a short plain-text excerpt (at most 150 characters plus '...') from markdown
+function generateExcerpt(markdown) {
+  // Ordered [pattern, replacement] pairs; bold is unwrapped before italic so '**' is consumed first
+  const strippers = [
+    [/#{1,6}\s+/g, ''], // heading markers
+    [/\*\*(.+?)\*\*/g, '$1'], // bold -> inner text
+    [/\*(.+?)\*/g, '$1'], // italic -> inner text
+    [/\[(.+?)\]\(.+?\)/g, '$1'], // link -> link text
+    [/`(.+?)`/g, '$1'], // inline code -> inner text
+    [/---/g, ''], // horizontal rules
+  ];
+  const plain = strippers.reduce((acc, [pattern, sub]) => acc.replace(pattern, sub), markdown).trim();
+
+  // Truncate anything longer than 150 characters and mark the cut with an ellipsis
+  const excerpt = plain.length > 150 ? plain.substring(0, 150).trim() + '...' : plain;
+
+  // An empty result falls back to a generic placeholder
+  return excerpt || 'Glossary term definition';
+}
+
+// Estimate reading time in whole minutes from the word count of a markdown string (never below 1)
+function calculateReadingTime(markdown) {
+  const words = markdown.split(/\s+/).filter(Boolean).length; // drop empty tokens from leading/trailing whitespace
+  const minutes = Math.ceil(words / 200); // Average reading speed: 200 words per minute
+  return Math.max(1, minutes); // Minimum 1 minute, even for empty input
+}
+
+// Insert paragraph breaks around "---" rules, before "##"/"###" headings and after bold runs; returns the rewritten string
+function normalizeContent(content) {
+  // Keep each "---" that follows a non-space character, but surround it with a blank line on both sides
+  content = content.replace(/(\S)\s*---\s*/g, '$1\n\n---\n\n');
+
+  // Turn the whitespace between preceding text and a "## " heading marker into a blank line
+  content = content.replace(/([^\n])\s+(##\s+)/g, '$1\n\n$2');
+
+  // Same treatment for "### " heading markers
+  content = content.replace(/([^\n])\s+(###\s+)/g, '$1\n\n$2');
+
+  // German/French (per original author): after a bold "Label:" plus one whitespace-free value, start the next "**" in a new paragraph
+  content = content.replace(/\*\*([^*]+):\*\*\s*([^\s*]+)\s+\*\*/g, '**$1:** $2\n\n**');
+
+  // Paragraph break after any "**" followed by whitespace and A-Z/Ä/Ö/Ü — NOTE(review): also matches inline bold mid-sentence; confirm intended
+  content = content.replace(/\*\*\s+([A-ZÄÖÜ])/g, '**\n\n$1');
+
+  // On "## " lines with 100+ characters before a period followed by whitespace and a capital, move the text after that period to a new paragraph
+  content = content.replace(/^##\s+(.{100,}?)\.(\s+[A-ZÄÖÜ])/gm, '## $1.\n\n$2');
+
+  return content;
+}
+
 // Build sections from h2 headings
 function buildSections(content, htmlContent) {
+  // Normalize content first to ensure h2 headings are on their own lines
+  content = normalizeContent(content);
+
   const sections = [];
   const lines = content.split('\n');
   let currentSection = null;
@@ -47,20 +100,52 @@ function buildSections(content, htmlContent) {
       // Save previous section
       if (currentSection) {
         const sectionMarkdown = currentContent.join('\n');
+        const sectionHtml = markdownToHtml(sectionMarkdown);
+
         currentSection.content_markdown = sectionMarkdown;
-        currentSection.content_html = markdownToHtml(sectionMarkdown);
+        currentSection.content_html = sectionHtml;
+        currentSection.excerpt = generateExcerpt(sectionMarkdown);
+        currentSection.readingTime = calculateReadingTime(sectionMarkdown);
+
         sections.push(currentSection);
       }
 
+      // Extract title - limit to first sentence or first 10 words if too long
+      let fullTitle = h2Match[1].trim();
+      let title = fullTitle;
+      let titleRemainder = '';
+
+      // If title is very long (>100 chars), take only first sentence or first 10 words
+      if (fullTitle.length > 100) {
+        // Try to get first sentence
+        const firstSentence = fullTitle.match(/^(.{1,100}?[.!?])\s/);
+        if (firstSentence) {
+          title = firstSentence[1];
+          titleRemainder = fullTitle.substring(firstSentence[1].length).trim();
+        } else {
+          // Fall back to first 10 words
+          const words = fullTitle.split(/\s+/);
+          title = words.slice(0, 10).join(' ');
+          titleRemainder = words.slice(10).join(' ');
+        }
+      }
+
       // Start new section
       currentSection = {
-        title: h2Match[1],
-        slug: h2Match[1].toLowerCase()
+        title: title,
+        slug: title.toLowerCase()
           .replace(/[^\w\s-]/g, '')
           .replace(/\s+/g, '-'),
-        order: sections.length + 1
+        number: sections.length + 1,
+        category: 'term',
+        technicalLevel: 'basic'
       };
       currentContent = [];
+
+      // Add remainder to content if exists
+      if (titleRemainder) {
+        currentContent.push(titleRemainder);
+      }
     } else if (currentSection) {
       currentContent.push(line);
     }
@@ -69,8 +154,13 @@ function buildSections(content, htmlContent) {
   // Save last section
   if (currentSection) {
     const sectionMarkdown = currentContent.join('\n');
+    const sectionHtml = markdownToHtml(sectionMarkdown);
+
     currentSection.content_markdown = sectionMarkdown;
-    currentSection.content_html = markdownToHtml(sectionMarkdown);
+    currentSection.content_html = sectionHtml;
+    currentSection.excerpt = generateExcerpt(sectionMarkdown);
+    currentSection.readingTime = calculateReadingTime(sectionMarkdown);
+
     sections.push(currentSection);
   }
 
@@ -108,13 +198,22 @@ async function run() {
     const sections = buildSections(en.content, markdownToHtml(en.content));
     console.log(`✓ Built ${sections.length} sections for English\n`);
 
-    // Build translation objects
+    // Build German sections
+    const sectionsDe = buildSections(de.content, markdownToHtml(de.content));
+    console.log(`✓ Built ${sectionsDe.length} sections for German\n`);
+
+    // Build French sections
+    const sectionsFr = buildSections(fr.content, markdownToHtml(fr.content));
+    console.log(`✓ Built ${sectionsFr.length} sections for French\n`);
+
+    // Build translation objects with sections
     const translations = {
       de: {
         title: de.metadata.title,
         content_markdown: de.content,
         content_html: markdownToHtml(de.content),
         toc: extractTOC(de.content),
+        sections: sectionsDe,
         metadata: {
           translated_by: 'deepl',
           translated_at: new Date(),
@@ -127,6 +226,7 @@ async function run() {
         content_markdown: fr.content,
         content_html: markdownToHtml(fr.content),
         toc: extractTOC(fr.content),
+        sections: sectionsFr,
         metadata: {
           translated_by: 'deepl',
           translated_at: new Date(),
@@ -136,7 +236,7 @@ async function run() {
       }
     };
 
-    console.log('✓ Built translation objects\n');
+    console.log('✓ Built translation objects with sections\n');
 
     // Find main glossary document (could be 'glossary', 'GLOSSARY', or long slug)
     const existingDoc = await collection.findOne({
@@ -193,7 +293,9 @@ async function run() {
     console.log('═══════════════════════════════════════════════════════════');
     console.log('  SUMMARY');
     console.log('═══════════════════════════════════════════════════════════\n');
-    console.log(`Sections created: ${sections.length}`);
+    console.log(`English sections: ${sections.length} (with excerpt, readingTime)`);
+    console.log(`German sections: ${sectionsDe.length} (with excerpt, readingTime)`);
+    console.log(`French sections: ${sectionsFr.length} (with excerpt, readingTime)`);
     console.log(`Translations embedded: 2 (de, fr)`);
     console.log(`Separate docs deleted: ${deleteResult.deletedCount}`);
     console.log('\n✅ Glossary structure fixed!\n');