// tractatus/scripts/fix-glossary-structure.js

/**
 * Fix Glossary Structure
 * - Add embedded translations (not separate documents)
 * - Add sections for card view
 * - Clean up separate glossary-de/glossary-fr documents
 */

require('dotenv').config();
const { MongoClient } = require('mongodb');
const fs = require('fs').promises;
const path = require('path');
const { markdownToHtml, extractTOC } = require('../src/utils/markdown.util');

// Parse frontmatter
function extractFrontmatter(content) {
  const frontMatterRegex = /^---\n([\s\S]*?)\n---\n([\s\S]*)$/;
  const match = content.match(frontMatterRegex);

  if (!match) return { metadata: {}, content };

  const frontMatterText = match[1];
  const remainingContent = match[2];

  const metadata = {};
  frontMatterText.split('\n').forEach(line => {
    const [key, ...valueParts] = line.split(':');
    if (key && valueParts.length > 0) {
      const value = valueParts.join(':').trim();
      metadata[key.trim()] = value.replace(/^["']|["']$/g, '');
    }
  });

  return { metadata, content: remainingContent };
}

// Generate excerpt from markdown content
function generateExcerpt(markdown) {
  // Remove markdown formatting
  let text = markdown
    .replace(/#{1,6}\s+/g, '') // Remove headers
    .replace(/\*\*(.+?)\*\*/g, '$1') // Remove bold
    .replace(/\*(.+?)\*/g, '$1') // Remove italic
    .replace(/\[(.+?)\]\(.+?\)/g, '$1') // Remove links
    .replace(/`(.+?)`/g, '$1') // Remove code
    .replace(/---/g, '') // Remove horizontal rules
    .trim();

  // Get first ~150 characters
  if (text.length > 150) {
    text = text.substring(0, 150).trim() + '...';
  }

  return text || 'Glossary term definition';
}

// Calculate reading time from word count
function calculateReadingTime(markdown) {
  const words = markdown.split(/\s+/).length;
  const minutes = Math.ceil(words / 200); // Average reading speed
  return Math.max(1, minutes); // Minimum 1 minute
}

// Normalize content - add line breaks where needed
function normalizeContent(content) {
  // Replace "---" horizontal rules with newlines
  content = content.replace(/(\S)\s*---\s*/g, '$1\n\n---\n\n');

  // Add line breaks before h2 headings
  content = content.replace(/([^\n])\s+(##\s+)/g, '$1\n\n$2');

  // Add line breaks before h3 headings
  content = content.replace(/([^\n])\s+(###\s+)/g, '$1\n\n$2');

  // For German/French: Add line breaks after "**Version:**" style metadata that comes after h1
  content = content.replace(/\*\*([^*]+):\*\*\s*([^\s*]+)\s+\*\*/g, '**$1:** $2\n\n**');

  // Add line breaks after closing ** before text starts
  content = content.replace(/\*\*\s+([A-ZÄÖÜ])/g, '**\n\n$1');

  // Fix h2 titles that have content on same line - keep only first sentence or up to 100 chars
  content = content.replace(/^##\s+(.{100,}?)\.(\s+[A-ZÄÖÜ])/gm, '## $1.\n\n$2');

  return content;
}

// Build sections from h2 headings
function buildSections(content, htmlContent) {
  // Normalize content first to ensure h2 headings are on their own lines
  content = normalizeContent(content);

  const sections = [];
  const lines = content.split('\n');
  let currentSection = null;
  let currentContent = [];

  for (const line of lines) {
    const h2Match = line.match(/^## (.+)$/);

    if (h2Match) {
      // Save previous section
      if (currentSection) {
        const sectionMarkdown = currentContent.join('\n');
        const sectionHtml = markdownToHtml(sectionMarkdown);

        currentSection.content_markdown = sectionMarkdown;
        currentSection.content_html = sectionHtml;
        currentSection.excerpt = generateExcerpt(sectionMarkdown);
        currentSection.readingTime = calculateReadingTime(sectionMarkdown);

        sections.push(currentSection);
      }

      // Extract title - limit to first sentence or first 10 words if too long
      let fullTitle = h2Match[1].trim();
      let title = fullTitle;
      let titleRemainder = '';

      // If title is very long (>100 chars), take only first sentence or first 10 words
      if (fullTitle.length > 100) {
        // Try to get first sentence
        const firstSentence = fullTitle.match(/^(.{1,100}?[.!?])\s/);
        if (firstSentence) {
          title = firstSentence[1];
          titleRemainder = fullTitle.substring(firstSentence[1].length).trim();
        } else {
          // Fall back to first 10 words
          const words = fullTitle.split(/\s+/);
          title = words.slice(0, 10).join(' ');
          titleRemainder = words.slice(10).join(' ');
        }
      }

      // Start new section
      currentSection = {
        title: title,
        slug: title.toLowerCase()
          .replace(/[^\w\s-]/g, '')
          .replace(/\s+/g, '-'),
        number: sections.length + 1,
        category: 'term',
        technicalLevel: 'basic'
      };
      currentContent = [];

      // Add remainder to content if exists
      if (titleRemainder) {
        currentContent.push(titleRemainder);
      }
    } else if (currentSection) {
      currentContent.push(line);
    }
  }

  // Save last section
  if (currentSection) {
    const sectionMarkdown = currentContent.join('\n');
    const sectionHtml = markdownToHtml(sectionMarkdown);

    currentSection.content_markdown = sectionMarkdown;
    currentSection.content_html = sectionHtml;
    currentSection.excerpt = generateExcerpt(sectionMarkdown);
    currentSection.readingTime = calculateReadingTime(sectionMarkdown);

    sections.push(currentSection);
  }

  return sections;
}

async function run() {
  const mongoUri = process.env.MONGODB_URI || 'mongodb://localhost:27017';
  const dbName = process.env.MONGODB_DB || 'tractatus_dev';
  const client = new MongoClient(mongoUri);

  try {
    await client.connect();
    const db = client.db(dbName);
    const collection = db.collection('documents');

    console.log('═══════════════════════════════════════════════════════════');
    console.log('  FIXING GLOSSARY STRUCTURE');
    console.log('═══════════════════════════════════════════════════════════\n');

    // Read markdown files
    const baseDir = path.join(__dirname, '../docs/markdown');
    const glossaryEN = await fs.readFile(path.join(baseDir, 'GLOSSARY.md'), 'utf8');
    const glossaryDE = await fs.readFile(path.join(baseDir, 'GLOSSARY-DE.md'), 'utf8');
    const glossaryFR = await fs.readFile(path.join(baseDir, 'GLOSSARY-FR.md'), 'utf8');

    // Parse each file
    const en = extractFrontmatter(glossaryEN);
    const de = extractFrontmatter(glossaryDE);
    const fr = extractFrontmatter(glossaryFR);

    console.log('✓ Parsed markdown files\n');

    // Build English sections
    const sections = buildSections(en.content, markdownToHtml(en.content));
    console.log(`✓ Built ${sections.length} sections for English\n`);

    // Build German sections
    const sectionsDe = buildSections(de.content, markdownToHtml(de.content));
    console.log(`✓ Built ${sectionsDe.length} sections for German\n`);

    // Build French sections
    const sectionsFr = buildSections(fr.content, markdownToHtml(fr.content));
    console.log(`✓ Built ${sectionsFr.length} sections for French\n`);

    // Build translation objects with sections
    const translations = {
      de: {
        title: de.metadata.title,
        content_markdown: de.content,
        content_html: markdownToHtml(de.content),
        toc: extractTOC(de.content),
        sections: sectionsDe,
        metadata: {
          translated_by: 'deepl',
          translated_at: new Date(),
          reviewed: false,
          source_version: '1.1'
        }
      },
      fr: {
        title: fr.metadata.title,
        content_markdown: fr.content,
        content_html: markdownToHtml(fr.content),
        toc: extractTOC(fr.content),
        sections: sectionsFr,
        metadata: {
          translated_by: 'deepl',
          translated_at: new Date(),
          reviewed: false,
          source_version: '1.1'
        }
      }
    };

    console.log('✓ Built translation objects with sections\n');

    // Find main glossary document (could be 'glossary', 'GLOSSARY', or long slug)
    const existingDoc = await collection.findOne({
      $or: [
        { slug: 'glossary' },
        { slug: 'GLOSSARY' },
        { slug: /^tractatus-agentic-governance-system-glossary-of-terms$/ }
      ]
    });

    if (!existingDoc) {
      console.error('✗ Could not find main glossary document');
      process.exit(1);
    }

    console.log(`✓ Found glossary document: ${existingDoc.slug}\n`);

    // Update main glossary document
    const result = await collection.updateOne(
      { _id: existingDoc._id },
      {
        $set: {
          slug: 'glossary', // Normalize to lowercase
          category: 'getting-started', // Move to Getting Started section
          sections: sections,
          translations: translations,
          content_html: markdownToHtml(en.content),
          content_markdown: en.content,
          toc: extractTOC(en.content),
          updated_at: new Date()
        }
      }
    );

    console.log(`✓ Updated glossary document (${result.modifiedCount} modified)\n`);

    // Delete separate translation documents and old duplicates
    const deleteResult = await collection.deleteMany({
      $and: [
        { _id: { $ne: existingDoc._id } }, // Don't delete the main one
        {
          $or: [
            { slug: 'glossary-de' },
            { slug: 'glossary-fr' },
            { slug: 'GLOSSARY' },
            { slug: /^tractatus-agentic-governance-system-glossary-of-terms/ }
          ]
        }
      ]
    });

    console.log(`✓ Deleted ${deleteResult.deletedCount} duplicate/separate translation documents\n`);

    console.log('═══════════════════════════════════════════════════════════');
    console.log('  SUMMARY');
    console.log('═══════════════════════════════════════════════════════════\n');
    console.log(`English sections: ${sections.length} (with excerpt, readingTime)`);
    console.log(`German sections: ${sectionsDe.length} (with excerpt, readingTime)`);
    console.log(`French sections: ${sectionsFr.length} (with excerpt, readingTime)`);
    console.log(`Translations embedded: 2 (de, fr)`);
    console.log(`Separate docs deleted: ${deleteResult.deletedCount}`);
    console.log('\n✅ Glossary structure fixed!\n');

  } catch (error) {
    console.error('Error:', error);
    process.exit(1);
  } finally {
    await client.close();
  }
}

// Entry point: invoke run() as soon as this module is loaded.
run();