diff --git a/src/controllers/documents.controller.js b/src/controllers/documents.controller.js
index 988f643b..73596050 100644
--- a/src/controllers/documents.controller.js
+++ b/src/controllers/documents.controller.js
@@ -5,6 +5,7 @@
 
 const Document = require('../models/Document.model');
 const { markdownToHtml, extractTOC } = require('../utils/markdown.util');
+const { extractAndProcessSections } = require('../utils/sections.util');
 const logger = require('../utils/logger.util');
 
 /**
@@ -113,6 +114,12 @@ async function getDocument(req, res) {
     if (document.translations && document.translations[lang]) {
       const translation = document.translations[lang];
 
+      // TEMPORARY WORKAROUND: Use English sections (markdown structure preserved)
+      // The DeepL translation mangled markdown formatting, so we use English structure
+      // but with translated title and content
+      // TODO: Re-translate with proper markdown preservation settings
+      const sections = document.sections || [];
+
       // Return document with translated fields
       const translatedDoc = {
         ...document,
@@ -120,6 +127,7 @@ async function getDocument(req, res) {
         content_html: translation.content_html || document.content_html,
         content_markdown: translation.content_markdown || document.content_markdown,
         toc: translation.toc || document.toc,
+        sections: sections, // Use English sections as workaround
         language: lang,
         translation_metadata: translation.metadata
       };
diff --git a/src/utils/sections.util.js b/src/utils/sections.util.js
new file mode 100644
index 00000000..2befddd1
--- /dev/null
+++ b/src/utils/sections.util.js
@@ -0,0 +1,292 @@
+/**
+ * Sections Utility
+ * Extract and process document sections from markdown content
+ */
+
+const { markdownToHtml } = require('./markdown.util');
+
+// H2 heading line ("## Title"); deeper headings such as "###" do not match.
+const H2_HEADING = /^## (.+)$/;
+
+// Fence line of a fenced code block: up to three spaces of indent, a run of
+// at least three backticks or tildes, then an optional info string.
+const CODE_FENCE = /^ {0,3}(`{3,}|~{3,})(.*)$/;
+
+// Reading speed assumed by estimateReadingTime.
+const WORDS_PER_MINUTE = 200;
+
+// Title keywords and content patterns used by classifySection. The patterns
+// carry no "g" flag, so .test() keeps no state between calls.
+const CRITICAL_TITLE_KEYWORDS = ['limitation', 'failure', 'warning', 'security', 'risk'];
+const CRITICAL_CONTENT = /⚠️|critical|warning|caution|danger/i;
+const REFERENCE_TITLE_KEYWORDS = ['glossary', 'reference', 'contact', 'license', 'getting started'];
+const TECHNICAL_TITLE_KEYWORDS = ['technical', 'architecture', 'implementation', 'integration', 'api'];
+const TECHNICAL_CONTENT = /```|`[a-z]+`|function|class|const|import/i;
+const PRACTICAL_TITLE_KEYWORDS = [
+  'how',
+  'guide',
+  'tutorial',
+  'example',
+  'use case',
+  'should use',
+  'contributing'
+];
+
+// Content keywords used by determineTechnicalLevel.
+const ADVANCED_CONTENT_KEYWORDS = ['api', 'implementation', 'integration', 'architecture'];
+const INTERMEDIATE_CONTENT_KEYWORDS = ['service', 'component', 'system', 'framework'];
+
+/**
+ * Track fenced code blocks while scanning markdown line by line.
+ *
+ * @param {string|null} openFence - Marker that opened the current block, or null
+ * @param {string} line - Line being scanned
+ * @returns {string|null} Marker of the block that is open after this line, or null
+ */
+function nextFenceState(openFence, line) {
+  const fence = line.match(CODE_FENCE);
+  if (!fence) return openFence;
+
+  const marker = fence[1];
+  const rest = fence[2];
+
+  if (openFence === null) {
+    // A backtick run followed by more backticks on the same line is inline code
+    return marker[0] === '`' && rest.includes('`') ? null : marker;
+  }
+
+  // Only a bare fence of the same character, at least as long as the opening
+  // one, closes the block
+  const closesBlock =
+    marker[0] === openFence[0] &&
+    marker.length >= openFence.length &&
+    rest.trim() === '';
+  return closesBlock ? null : openFence;
+}
+
+/**
+ * Extract sections from markdown content.
+ * Every H2 heading ("## Title") opens a section that runs to the next H2
+ * heading or the end of the input; text before the first H2 heading is
+ * dropped. LF and CRLF line endings are both accepted, and "## " lines
+ * inside fenced code blocks stay part of the section content.
+ *
+ * @param {string} markdown - Markdown source
+ * @returns {Array<{title: string, content_md: string}>} Sections in document order
+ */
+function extractSectionsFromMarkdown(markdown) {
+  if (typeof markdown !== 'string' || markdown === '') return [];
+
+  const sections = [];
+  let currentSection = null;
+  let contentBuffer = [];
+  let openFence = null;
+
+  const saveCurrentSection = () => {
+    if (currentSection) {
+      currentSection.content_md = contentBuffer.join('\n').trim();
+      sections.push(currentSection);
+    }
+  };
+
+  // Splitting on \r?\n keeps a trailing \r from defeating the heading match
+  for (const line of markdown.split(/\r?\n/)) {
+    const wasInFence = openFence !== null;
+    openFence = nextFenceState(openFence, line);
+
+    // Headings count only outside code blocks
+    const h2Match = !wasInFence && openFence === null ? line.match(H2_HEADING) : null;
+    if (h2Match) {
+      saveCurrentSection();
+      currentSection = {
+        title: h2Match[1].trim(),
+        content_md: ''
+      };
+      contentBuffer = [];
+      continue;
+    }
+
+    // Collect content for current section
+    if (currentSection) {
+      contentBuffer.push(line);
+    }
+  }
+
+  saveCurrentSection();
+  return sections;
+}
+
+/**
+ * Generate a plain-text excerpt from markdown.
+ * Strips heading, bold, italic, link, inline-code and list markup and turns
+ * blank-line gaps into single spaces. Text longer than maxLength is cut at
+ * maxLength, then either shortened to its last period (when that period lies
+ * beyond 70% of maxLength) or suffixed with '...'.
+ *
+ * @param {string} markdown - Markdown source
+ * @param {number} [maxLength=150] - Length at which the text is cut
+ * @returns {string} Excerpt, or '' for empty input
+ */
+function generateExcerpt(markdown, maxLength = 150) {
+  if (!markdown) return '';
+
+  let text = markdown
+    .replace(/^#+\s+/gm, '')
+    .replace(/\*\*(.+?)\*\*/g, '$1')
+    .replace(/\*(.+?)\*/g, '$1')
+    .replace(/\[(.+?)\]\(.+?\)/g, '$1')
+    .replace(/`(.+?)`/g, '$1')
+    .replace(/^[-*+]\s+/gm, '')
+    .replace(/^\d+\.\s+/gm, '')
+    .replace(/\n{2,}/g, ' ')
+    .trim();
+
+  if (text.length > maxLength) {
+    text = text.substring(0, maxLength).trim();
+    const lastPeriod = text.lastIndexOf('.');
+    if (lastPeriod > maxLength * 0.7) {
+      text = text.substring(0, lastPeriod + 1);
+    } else {
+      text += '...';
+    }
+  }
+
+  return text;
+}
+
+/**
+ * Estimate reading time from text at WORDS_PER_MINUTE words per minute.
+ *
+ * @param {string} text - Text to measure
+ * @returns {number} Whole minutes, never less than 1
+ */
+function estimateReadingTime(text) {
+  if (!text) return 1;
+
+  // Trim first so leading/trailing whitespace does not count as a word
+  const trimmed = text.trim();
+  const wordCount = trimmed === '' ? 0 : trimmed.split(/\s+/).length;
+  return Math.max(1, Math.ceil(wordCount / WORDS_PER_MINUTE));
+}
+
+/**
+ * Classify section category.
+ * Checks run in a fixed order and the first hit wins: 'critical', 'reference',
+ * 'technical', 'practical', then 'conceptual' as the fallback. Title keywords
+ * are matched as case-insensitive substrings.
+ *
+ * @param {string} title - Section title
+ * @param {string} content - Section markdown
+ * @returns {string} Category name
+ */
+function classifySection(title, content) {
+  // Missing title or content is treated as empty instead of throwing
+  const titleLower = String(title || '').toLowerCase();
+  const body = String(content || '');
+  const titleHasAny = (keywords) => keywords.some((keyword) => titleLower.includes(keyword));
+
+  if (titleHasAny(CRITICAL_TITLE_KEYWORDS) || CRITICAL_CONTENT.test(body)) {
+    return 'critical';
+  }
+
+  if (titleHasAny(REFERENCE_TITLE_KEYWORDS)) {
+    return 'reference';
+  }
+
+  if (titleHasAny(TECHNICAL_TITLE_KEYWORDS) || TECHNICAL_CONTENT.test(body)) {
+    return 'technical';
+  }
+
+  if (titleHasAny(PRACTICAL_TITLE_KEYWORDS)) {
+    return 'practical';
+  }
+
+  return 'conceptual';
+}
+
+/**
+ * Determine technical level.
+ * 'advanced' when the content holds a fenced code block or an advanced
+ * keyword, 'intermediate' when it holds an intermediate keyword, otherwise
+ * 'beginner'. Keywords are matched as case-insensitive substrings.
+ *
+ * @param {string} content - Section markdown
+ * @returns {string} Technical level name
+ */
+function determineTechnicalLevel(content) {
+  // Missing content is treated as empty instead of throwing
+  const body = String(content || '');
+  const contentLower = body.toLowerCase();
+  const contentHasAny = (keywords) => keywords.some((keyword) => contentLower.includes(keyword));
+
+  if (/```[\s\S]+```/.test(body) || contentHasAny(ADVANCED_CONTENT_KEYWORDS)) {
+    return 'advanced';
+  }
+
+  if (contentHasAny(INTERMEDIATE_CONTENT_KEYWORDS)) {
+    return 'intermediate';
+  }
+
+  return 'beginner';
+}
+
+/**
+ * Generate slug from title.
+ * Lowercases the title, turns every run of characters outside a-z and 0-9
+ * into one hyphen, and strips leading and trailing hyphens.
+ *
+ * @param {string} title - Section title
+ * @returns {string} Slug; '' when the title has no a-z or 0-9 characters
+ */
+function generateSlug(title) {
+  return String(title || '')
+    .toLowerCase()
+    .replace(/[^a-z0-9]+/g, '-')
+    .replace(/^-+|-+$/g, '');
+}
+
+/**
+ * Process sections to add metadata.
+ * Enriches each section with slug, HTML content, excerpt, reading time,
+ * category, technical level and a 1-based order.
+ *
+ * @param {Array<{title: string, content_md: string}>} sections - Extracted sections
+ * @returns {Array<Object>} New section objects; the input objects are left untouched
+ */
+function processSections(sections) {
+  if (!Array.isArray(sections)) return [];
+
+  return sections.map((section, index) => ({
+    ...section,
+    slug: generateSlug(section.title),
+    content_html: markdownToHtml(section.content_md),
+    excerpt: generateExcerpt(section.content_md),
+    readingTime: estimateReadingTime(section.content_md),
+    category: classifySection(section.title, section.content_md),
+    technicalLevel: determineTechnicalLevel(section.content_md),
+    order: index + 1
+  }));
+}
+
+/**
+ * Extract and process sections from markdown.
+ * Complete pipeline: extract -> process -> return
+ *
+ * @param {string} markdown - Markdown source
+ * @returns {Array<Object>} Processed sections in document order
+ */
+function extractAndProcessSections(markdown) {
+  const sections = extractSectionsFromMarkdown(markdown);
+  return processSections(sections);
+}
+
+module.exports = {
+  extractSectionsFromMarkdown,
+  generateExcerpt,
+  estimateReadingTime,
+  classifySection,
+  determineTechnicalLevel,
+  generateSlug,
+  processSections,
+  extractAndProcessSections
+};