/**
 * Fix Glossary Structure
 * - Add embedded translations (not separate documents)
 * - Add sections for card view
 * - Clean up separate glossary-de/glossary-fr documents
 */

require('dotenv').config();
const { MongoClient } = require('mongodb');
const fs = require('fs').promises;
const path = require('path');
const { markdownToHtml, extractTOC } = require('../src/utils/markdown.util');

/**
 * Split a markdown document into its front matter and its body.
 *
 * The front matter is the block fenced by `---` lines at the very start of the
 * document. Every `key: value` line inside it becomes a string entry in
 * `metadata`; a leading and/or trailing quote character is stripped from the
 * value. This is a flat line-by-line parser, not a YAML parser: nested or
 * multi-line values are not interpreted.
 *
 * @param {string} content - Raw file contents.
 * @returns {{ metadata: Object<string, string>, content: string }} The parsed
 *   metadata and the remaining body. When no front matter is found, `metadata`
 *   is empty and `content` is the input unchanged.
 */
function extractFrontmatter(content) {
  // `\r?\n` keeps files with Windows (CRLF) line endings working; the closing
  // fence may also be the very last line of the file.
  const frontMatterRegex = /^---\r?\n([\s\S]*?)\r?\n---(?:\r?\n|$)([\s\S]*)$/;
  const match = content.match(frontMatterRegex);

  if (!match) return { metadata: {}, content };

  const [, frontMatterText, remainingContent] = match;
  const metadata = {};

  for (const line of frontMatterText.split(/\r?\n/)) {
    // Split on the FIRST colon only so values such as URLs keep their colons.
    const separatorIndex = line.indexOf(':');
    if (separatorIndex === -1) continue;

    const key = line.slice(0, separatorIndex).trim();
    if (!key) continue;

    const value = line.slice(separatorIndex + 1).trim();
    metadata[key] = value.replace(/^["']|["']$/g, '');
  }

  return { metadata, content: remainingContent };
}

/**
 * Produce a short plain-text excerpt from a markdown fragment.
 *
 * Heading markers, bold/italic markers, link targets, inline-code backticks
 * and `---` rules are stripped, runs of whitespace (including newlines) are
 * collapsed to a single space, and the result is cut to at most 150 characters
 * followed by `...`.
 *
 * @param {string} markdown - Markdown source of one section.
 * @returns {string} The excerpt, or `'Glossary term definition'` when nothing
 *   is left after stripping.
 */
function generateExcerpt(markdown) {
  const MAX_LENGTH = 150;

  const plainText = markdown
    .replace(/#{1,6}\s+/g, '') // Remove headers
    .replace(/\*\*(.+?)\*\*/g, '$1') // Remove bold
    .replace(/\*(.+?)\*/g, '$1') // Remove italic
    .replace(/\[(.+?)\]\(.+?\)/g, '$1') // Remove links
    .replace(/`(.+?)`/g, '$1') // Remove code
    .replace(/---/g, '') // Remove horizontal rules
    .replace(/\s+/g, ' ') // Line breaks must not eat into the length budget
    .trim();

  if (plainText.length <= MAX_LENGTH) {
    return plainText || 'Glossary term definition';
  }

  let truncated = plainText.substring(0, MAX_LENGTH);

  // substring() counts UTF-16 code units; if the cut landed between the two
  // halves of a surrogate pair, drop the dangling high surrogate.
  const lastUnit = truncated.charCodeAt(truncated.length - 1);
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
    truncated = truncated.slice(0, -1);
  }

  return `${truncated.trim()}...`;
}

/**
 * Estimate the reading time of a markdown fragment.
 *
 * Words are runs of non-whitespace characters, read at 200 words per minute
 * and rounded up.
 *
 * @param {string} markdown - Markdown source to measure.
 * @returns {number} Whole minutes, never less than 1.
 */
function calculateReadingTime(markdown) {
  const WORDS_PER_MINUTE = 200; // Average reading speed

  // Matching \S+ (rather than splitting on \s+) avoids counting the empty
  // strings that leading/trailing whitespace would otherwise produce.
  const wordCount = (markdown.match(/\S+/g) || []).length;

  return Math.max(1, Math.ceil(wordCount / WORDS_PER_MINUTE)); // Minimum 1 minute
}

/**
 * Re-insert the line breaks that markdown needs around structural elements.
 *
 * Applies, in order:
 *  1. put every `---` on its own paragraph;
 *  2. put every `## ` heading on its own paragraph;
 *  3. put every `### ` heading on its own paragraph;
 *  4. break between consecutive bold `Label:` value pairs;
 *  5. break after a bold marker that is followed by whitespace and an
 *     uppercase letter;
 *  6. for an h2 line whose first sentence is at least 100 characters long,
 *     move the text after that sentence into a new paragraph.
 *
 * Rules 5 and 6 use `\p{Lu}` so that any uppercase letter counts (for example
 * accented capitals), not only A-Z and the German umlauts.
 *
 * @param {string} content - Markdown source.
 * @returns {string} The markdown with the line breaks added.
 */
function normalizeContent(content) {
  return content
    // 1. Horizontal rules
    .replace(/(\S)\s*---\s*/g, '$1\n\n---\n\n')
    // 2. h2 headings
    .replace(/([^\n])\s+(##\s+)/g, '$1\n\n$2')
    // 3. h3 headings
    .replace(/([^\n])\s+(###\s+)/g, '$1\n\n$2')
    // 4. Bold "Label:" metadata pairs that follow each other on one line
    .replace(/\*\*([^*]+):\*\*\s*([^\s*]+)\s+\*\*/g, '**$1:** $2\n\n**')
    // 5. Bold marker followed by the start of a sentence
    .replace(/\*\*\s+(\p{Lu})/gu, '**\n\n$1')
    // 6. Over-long h2 line: keep the first sentence as the heading
    .replace(/^##\s+(.{100,}?)\.(\s+\p{Lu})/gmu, '## $1.\n\n$2');
}

/**
 * Attach the derived fields to a section once all of its lines are collected.
 *
 * @param {Object} section - Section header fields (title, slug, number, ...).
 * @param {string[]} contentLines - Body lines collected for the section.
 * @returns {Object} A new object: the header fields followed by
 *   `content_markdown`, `content_html`, `excerpt` and `readingTime`.
 */
function finalizeSection(section, contentLines) {
  const sectionMarkdown = contentLines.join('\n');

  return {
    ...section,
    content_markdown: sectionMarkdown,
    content_html: markdownToHtml(sectionMarkdown),
    excerpt: generateExcerpt(sectionMarkdown),
    readingTime: calculateReadingTime(sectionMarkdown),
  };
}

/**
 * Shorten an h2 line that is longer than 100 characters.
 *
 * Prefers the first sentence (at most 100 characters, ending in `.`, `!` or
 * `?`); otherwise falls back to the first 10 words.
 *
 * @param {string} fullTitle - Trimmed text of the h2 line.
 * @returns {{ title: string, remainder: string }} The title to use and the
 *   text that was cut off (empty when nothing was cut).
 */
function splitSectionTitle(fullTitle) {
  if (fullTitle.length <= 100) {
    return { title: fullTitle, remainder: '' };
  }

  const firstSentence = fullTitle.match(/^(.{1,100}?[.!?])\s/);
  if (firstSentence) {
    const title = firstSentence[1];
    return { title, remainder: fullTitle.substring(title.length).trim() };
  }

  const words = fullTitle.split(/\s+/);
  return {
    title: words.slice(0, 10).join(' '),
    remainder: words.slice(10).join(' '),
  };
}

/**
 * Split a markdown document into one section per h2 (`## `) heading.
 *
 * The content is normalized first so that h2 headings sit on their own lines.
 * Text before the first h2 is not part of any section. Each section gets a
 * title, slug, 1-based number, fixed `category`/`technicalLevel` values and
 * the derived markdown, HTML, excerpt and reading time of its body.
 *
 * @param {string} content - Markdown body (front matter already removed).
 * @param {string} [htmlContent] - Not used by this function; kept so existing
 *   call sites that pass it keep working.
 * @returns {Object[]} The sections in document order.
 */
function buildSections(content, htmlContent) {
  const sections = [];
  let currentSection = null;
  let currentLines = [];

  const flushCurrentSection = () => {
    if (currentSection) {
      sections.push(finalizeSection(currentSection, currentLines));
    }
  };

  // Split on \r?\n: with a plain '\n' split, CRLF input leaves a trailing \r
  // on every line, and `.` never matches \r, so no heading would be detected.
  for (const line of normalizeContent(content).split(/\r?\n/)) {
    const h2Match = line.match(/^## (.+)$/);

    if (!h2Match) {
      if (currentSection) currentLines.push(line);
      continue;
    }

    flushCurrentSection();

    const { title, remainder } = splitSectionTitle(h2Match[1].trim());

    currentSection = {
      title,
      // NOTE(review): \w is ASCII-only here, so letters such as ä or é are
      // dropped from German/French slugs. Left unchanged because the
      // consumers of `slug` are not visible from this script.
      slug: title
        .toLowerCase()
        .replace(/[^\w\s-]/g, '')
        .replace(/\s+/g, '-'),
      number: sections.length + 1,
      category: 'term',
      technicalLevel: 'basic',
    };

    // Text cut off an over-long heading becomes the first body line.
    currentLines = remainder ? [remainder] : [];
  }

  flushCurrentSection();

  return sections;
}

/**
 * Rebuild the main glossary document in the `documents` collection.
 *
 * Steps:
 *  1. read GLOSSARY.md, GLOSSARY-DE.md and GLOSSARY-FR.md from
 *     `../docs/markdown` (relative to this script);
 *  2. build sections for each language and embed the German and French
 *     versions under `translations`;
 *  3. update the existing glossary document (slug normalized to `glossary`,
 *     category set to `getting-started`);
 *  4. delete the separate `glossary-de` / `glossary-fr` documents and old
 *     duplicates.
 *
 * Connection settings come from `MONGODB_URI` and `MONGODB_DB`, falling back
 * to a local `tractatus_dev` database.
 *
 * On failure the error is logged and `process.exitCode` is set to 1 instead of
 * calling `process.exit()`, so the `finally` block still closes the client.
 *
 * @returns {Promise<void>}
 */
async function run() {
  const RULE = '═══════════════════════════════════════════════════════════';

  const mongoUri = process.env.MONGODB_URI || 'mongodb://localhost:27017';
  const dbName = process.env.MONGODB_DB || 'tractatus_dev';
  const client = new MongoClient(mongoUri);

  try {
    await client.connect();
    const collection = client.db(dbName).collection('documents');

    console.log(RULE);
    console.log(' FIXING GLOSSARY STRUCTURE');
    console.log(`${RULE}\n`);

    // Read markdown files (independent reads, so do them in parallel)
    const baseDir = path.join(__dirname, '../docs/markdown');
    const [glossaryEN, glossaryDE, glossaryFR] = await Promise.all(
      ['GLOSSARY.md', 'GLOSSARY-DE.md', 'GLOSSARY-FR.md'].map((fileName) =>
        fs.readFile(path.join(baseDir, fileName), 'utf8')
      )
    );

    // Parse each file
    const en = extractFrontmatter(glossaryEN);
    const de = extractFrontmatter(glossaryDE);
    const fr = extractFrontmatter(glossaryFR);

    console.log('✓ Parsed markdown files\n');

    // Each body is rendered to HTML once and reused below.
    // NOTE(review): assumes markdownToHtml returns the same output for the
    // same input — the previous code rendered each body twice.
    const enHtml = markdownToHtml(en.content);
    const sections = buildSections(en.content, enHtml);
    console.log(`✓ Built ${sections.length} sections for English\n`);

    const deHtml = markdownToHtml(de.content);
    const sectionsDe = buildSections(de.content, deHtml);
    console.log(`✓ Built ${sectionsDe.length} sections for German\n`);

    const frHtml = markdownToHtml(fr.content);
    const sectionsFr = buildSections(fr.content, frHtml);
    console.log(`✓ Built ${sectionsFr.length} sections for French\n`);

    // Build translation objects with sections
    const buildTranslation = (parsed, html, translatedSections) => ({
      title: parsed.metadata.title,
      content_markdown: parsed.content,
      content_html: html,
      toc: extractTOC(parsed.content),
      sections: translatedSections,
      metadata: {
        translated_by: 'deepl',
        translated_at: new Date(),
        reviewed: false,
        source_version: '1.1',
      },
    });

    const translations = {
      de: buildTranslation(de, deHtml, sectionsDe),
      fr: buildTranslation(fr, frHtml, sectionsFr),
    };

    console.log('✓ Built translation objects with sections\n');

    // Find main glossary document (could be 'glossary', 'GLOSSARY', or long slug)
    const existingDoc = await collection.findOne({
      $or: [
        { slug: 'glossary' },
        { slug: 'GLOSSARY' },
        { slug: /^tractatus-agentic-governance-system-glossary-of-terms$/ },
      ],
    });

    if (!existingDoc) {
      console.error('✗ Could not find main glossary document');
      // Not process.exit(): that would skip the finally block below.
      process.exitCode = 1;
      return;
    }

    console.log(`✓ Found glossary document: ${existingDoc.slug}\n`);

    // Update main glossary document
    const result = await collection.updateOne(
      { _id: existingDoc._id },
      {
        $set: {
          slug: 'glossary', // Normalize to lowercase
          category: 'getting-started', // Move to Getting Started section
          sections,
          translations,
          content_html: enHtml,
          content_markdown: en.content,
          toc: extractTOC(en.content),
          updated_at: new Date(),
        },
      }
    );

    console.log(`✓ Updated glossary document (${result.modifiedCount} modified)\n`);

    // Delete separate translation documents and old duplicates
    const deleteResult = await collection.deleteMany({
      $and: [
        { _id: { $ne: existingDoc._id } }, // Don't delete the main one
        {
          $or: [
            { slug: 'glossary-de' },
            { slug: 'glossary-fr' },
            { slug: 'GLOSSARY' },
            { slug: /^tractatus-agentic-governance-system-glossary-of-terms/ },
          ],
        },
      ],
    });

    console.log(`✓ Deleted ${deleteResult.deletedCount} duplicate/separate translation documents\n`);

    console.log(RULE);
    console.log(' SUMMARY');
    console.log(`${RULE}\n`);
    console.log(`English sections: ${sections.length} (with excerpt, readingTime)`);
    console.log(`German sections: ${sectionsDe.length} (with excerpt, readingTime)`);
    console.log(`French sections: ${sectionsFr.length} (with excerpt, readingTime)`);
    console.log('Translations embedded: 2 (de, fr)');
    console.log(`Separate docs deleted: ${deleteResult.deletedCount}`);
    console.log('\n✅ Glossary structure fixed!\n');
  } catch (error) {
    console.error('Error:', error);
    // Let the finally block close the connection; the process then exits
    // with this code once the event loop is empty.
    process.exitCode = 1;
  } finally {
    await client.close();
  }
}

// Entry point. run() logs its own errors; this handler only catches a
// rejection that escapes it (for example from closing the client), so the
// promise is never left floating.
run().catch((error) => {
  console.error('Error:', error);
  process.exitCode = 1;
});