fix: add excerpt, readingTime, sections for all glossary translations
This commit is contained in:
parent
40488a3d7c
commit
bf187ff411
1 changed files with 110 additions and 8 deletions
|
|
@ -33,8 +33,61 @@ function extractFrontmatter(content) {
|
|||
return { metadata, content: remainingContent };
|
||||
}
|
||||
|
||||
// Generate excerpt from markdown content
|
||||
/**
 * Build a short, single-line plain-text excerpt from a markdown string.
 *
 * Common inline markdown (headers, bold, italic, links, inline code and
 * `---` rules) is stripped, whitespace is collapsed, and the result is cut
 * to at most 150 characters followed by `...`.
 *
 * @param {string} markdown - Markdown source to summarise.
 * @returns {string} The excerpt, or 'Glossary term definition' when the
 *   input is not a string or nothing is left after stripping.
 */
function generateExcerpt(markdown) {
  const MAX_LENGTH = 150;
  const FALLBACK = 'Glossary term definition';

  // Missing content should yield the fallback text rather than a TypeError.
  if (typeof markdown !== 'string') {
    return FALLBACK;
  }

  const text = markdown
    // Header markers only count at the start of the text or after
    // whitespace, so words such as "C# is" are left intact.
    .replace(/(?<!\S)#{1,6}\s+/g, '')
    .replace(/\*\*(.+?)\*\*/g, '$1') // Remove bold
    .replace(/\*(.+?)\*/g, '$1') // Remove italic
    .replace(/\[(.+?)\]\(.+?\)/g, '$1') // Remove links, keep the label
    .replace(/`(.+?)`/g, '$1') // Remove inline code markers
    .replace(/---/g, '') // Remove horizontal rules
    // Collapse newlines and whitespace runs so the excerpt is one line and
    // the length budget is not spent on blank space.
    .replace(/\s+/g, ' ')
    .trim();

  if (text.length > MAX_LENGTH) {
    return `${text.substring(0, MAX_LENGTH).trim()}...`;
  }

  return text || FALLBACK;
}
|
||||
|
||||
// Calculate reading time from word count
|
||||
/**
 * Estimate the reading time of a markdown string in whole minutes.
 *
 * Words are whitespace-separated tokens of the trimmed text, so leading or
 * trailing whitespace never inflates the count.
 *
 * @param {string} markdown - Text whose words are counted.
 * @param {number} [wordsPerMinute=200] - Assumed reading speed; values that
 *   are not finite and positive fall back to 200.
 * @returns {number} Minutes rounded up, never less than 1.
 */
function calculateReadingTime(markdown, wordsPerMinute = 200) {
  const DEFAULT_WORDS_PER_MINUTE = 200;
  const rate =
    Number.isFinite(wordsPerMinute) && wordsPerMinute > 0
      ? wordsPerMinute
      : DEFAULT_WORDS_PER_MINUTE;

  // Missing content reads as the minimum instead of throwing.
  if (typeof markdown !== 'string') {
    return 1;
  }

  // Trim first: splitting "  a b\n" directly yields empty edge tokens that
  // would be miscounted as words.
  const trimmed = markdown.trim();
  const wordCount = trimmed === '' ? 0 : trimmed.split(/\s+/).length;

  return Math.max(1, Math.ceil(wordCount / rate)); // Minimum 1 minute
}
|
||||
|
||||
// Normalize content - add line breaks where needed
|
||||
/**
 * Re-insert paragraph breaks into markdown whose structure has been
 * flattened onto shared lines.
 *
 * The rewrites run strictly in the order listed; each one sees the output
 * of the previous one.
 *
 * @param {string} content - Markdown text to normalise.
 * @returns {string} The text with blank lines added around rules, headings
 *   and bold metadata.
 */
function normalizeContent(content) {
  const rewriteRules = [
    // Put every "---" that follows non-space text on its own paragraph.
    [/(\S)\s*---\s*/g, '$1\n\n---\n\n'],
    // Start a new paragraph before an inline h2 marker.
    [/([^\n])\s+(##\s+)/g, '$1\n\n$2'],
    // Start a new paragraph before an inline h3 marker.
    [/([^\n])\s+(###\s+)/g, '$1\n\n$2'],
    // "**Label:** value **" -> break after the value, before the next "**".
    [/\*\*([^*]+):\*\*\s*([^\s*]+)\s+\*\*/g, '**$1:** $2\n\n**'],
    // Break after "**" when whitespace and an uppercase letter follow it.
    [/\*\*\s+([A-ZÄÖÜ])/g, '**\n\n$1'],
    // On an h2 line, when at least 100 characters precede a period that is
    // followed by whitespace and an uppercase letter, split after that period.
    [/^##\s+(.{100,}?)\.(\s+[A-ZÄÖÜ])/gm, '## $1.\n\n$2'],
  ];

  return rewriteRules.reduce(
    (text, [pattern, replacement]) => text.replace(pattern, replacement),
    content
  );
}
|
||||
|
||||
// Build sections from h2 headings
|
||||
function buildSections(content, htmlContent) {
|
||||
// Normalize content first to ensure h2 headings are on their own lines
|
||||
content = normalizeContent(content);
|
||||
|
||||
const sections = [];
|
||||
const lines = content.split('\n');
|
||||
let currentSection = null;
|
||||
|
|
@ -47,20 +100,52 @@ function buildSections(content, htmlContent) {
|
|||
// Save previous section
|
||||
if (currentSection) {
|
||||
const sectionMarkdown = currentContent.join('\n');
|
||||
const sectionHtml = markdownToHtml(sectionMarkdown);
|
||||
|
||||
currentSection.content_markdown = sectionMarkdown;
|
||||
currentSection.content_html = markdownToHtml(sectionMarkdown);
|
||||
currentSection.content_html = sectionHtml;
|
||||
currentSection.excerpt = generateExcerpt(sectionMarkdown);
|
||||
currentSection.readingTime = calculateReadingTime(sectionMarkdown);
|
||||
|
||||
sections.push(currentSection);
|
||||
}
|
||||
|
||||
// Extract title - limit to first sentence or first 10 words if too long
|
||||
let fullTitle = h2Match[1].trim();
|
||||
let title = fullTitle;
|
||||
let titleRemainder = '';
|
||||
|
||||
// If title is very long (>100 chars), take only first sentence or first 10 words
|
||||
if (fullTitle.length > 100) {
|
||||
// Try to get first sentence
|
||||
const firstSentence = fullTitle.match(/^(.{1,100}?[.!?])\s/);
|
||||
if (firstSentence) {
|
||||
title = firstSentence[1];
|
||||
titleRemainder = fullTitle.substring(firstSentence[1].length).trim();
|
||||
} else {
|
||||
// Fall back to first 10 words
|
||||
const words = fullTitle.split(/\s+/);
|
||||
title = words.slice(0, 10).join(' ');
|
||||
titleRemainder = words.slice(10).join(' ');
|
||||
}
|
||||
}
|
||||
|
||||
// Start new section
|
||||
currentSection = {
|
||||
title: h2Match[1],
|
||||
slug: h2Match[1].toLowerCase()
|
||||
title: title,
|
||||
slug: title.toLowerCase()
|
||||
.replace(/[^\w\s-]/g, '')
|
||||
.replace(/\s+/g, '-'),
|
||||
order: sections.length + 1
|
||||
number: sections.length + 1,
|
||||
category: 'term',
|
||||
technicalLevel: 'basic'
|
||||
};
|
||||
currentContent = [];
|
||||
|
||||
// Add remainder to content if exists
|
||||
if (titleRemainder) {
|
||||
currentContent.push(titleRemainder);
|
||||
}
|
||||
} else if (currentSection) {
|
||||
currentContent.push(line);
|
||||
}
|
||||
|
|
@ -69,8 +154,13 @@ function buildSections(content, htmlContent) {
|
|||
// Save last section
|
||||
if (currentSection) {
|
||||
const sectionMarkdown = currentContent.join('\n');
|
||||
const sectionHtml = markdownToHtml(sectionMarkdown);
|
||||
|
||||
currentSection.content_markdown = sectionMarkdown;
|
||||
currentSection.content_html = markdownToHtml(sectionMarkdown);
|
||||
currentSection.content_html = sectionHtml;
|
||||
currentSection.excerpt = generateExcerpt(sectionMarkdown);
|
||||
currentSection.readingTime = calculateReadingTime(sectionMarkdown);
|
||||
|
||||
sections.push(currentSection);
|
||||
}
|
||||
|
||||
|
|
@ -108,13 +198,22 @@ async function run() {
|
|||
const sections = buildSections(en.content, markdownToHtml(en.content));
|
||||
console.log(`✓ Built ${sections.length} sections for English\n`);
|
||||
|
||||
// Build translation objects
|
||||
// Build German sections
|
||||
const sectionsDe = buildSections(de.content, markdownToHtml(de.content));
|
||||
console.log(`✓ Built ${sectionsDe.length} sections for German\n`);
|
||||
|
||||
// Build French sections
|
||||
const sectionsFr = buildSections(fr.content, markdownToHtml(fr.content));
|
||||
console.log(`✓ Built ${sectionsFr.length} sections for French\n`);
|
||||
|
||||
// Build translation objects with sections
|
||||
const translations = {
|
||||
de: {
|
||||
title: de.metadata.title,
|
||||
content_markdown: de.content,
|
||||
content_html: markdownToHtml(de.content),
|
||||
toc: extractTOC(de.content),
|
||||
sections: sectionsDe,
|
||||
metadata: {
|
||||
translated_by: 'deepl',
|
||||
translated_at: new Date(),
|
||||
|
|
@ -127,6 +226,7 @@ async function run() {
|
|||
content_markdown: fr.content,
|
||||
content_html: markdownToHtml(fr.content),
|
||||
toc: extractTOC(fr.content),
|
||||
sections: sectionsFr,
|
||||
metadata: {
|
||||
translated_by: 'deepl',
|
||||
translated_at: new Date(),
|
||||
|
|
@ -136,7 +236,7 @@ async function run() {
|
|||
}
|
||||
};
|
||||
|
||||
console.log('✓ Built translation objects\n');
|
||||
console.log('✓ Built translation objects with sections\n');
|
||||
|
||||
// Find main glossary document (could be 'glossary', 'GLOSSARY', or long slug)
|
||||
const existingDoc = await collection.findOne({
|
||||
|
|
@ -193,7 +293,9 @@ async function run() {
|
|||
console.log('═══════════════════════════════════════════════════════════');
|
||||
console.log(' SUMMARY');
|
||||
console.log('═══════════════════════════════════════════════════════════\n');
|
||||
console.log(`Sections created: ${sections.length}`);
|
||||
console.log(`English sections: ${sections.length} (with excerpt, readingTime)`);
|
||||
console.log(`German sections: ${sectionsDe.length} (with excerpt, readingTime)`);
|
||||
console.log(`French sections: ${sectionsFr.length} (with excerpt, readingTime)`);
|
||||
console.log(`Translations embedded: 2 (de, fr)`);
|
||||
console.log(`Separate docs deleted: ${deleteResult.deletedCount}`);
|
||||
console.log('\n✅ Glossary structure fixed!\n');
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue