// tractatus/src/utils/document-section-parser.js

/**
 * Document Section Parser
 * Analyzes markdown documents and creates card-based sections
 */

/**
 * Parse a markdown document into card sections, one per H2 heading.
 *
 * A section starts at a `## Heading` line and runs until the next H2 or the
 * end of the input. Text before the first H2 belongs to no section. H1 lines
 * found inside a section are dropped. Lines inside fenced code blocks
 * (``` or ~~~) are copied verbatim and never interpreted as headings, so
 * shell comments such as `# install deps` survive and `## ...` inside code
 * does not start a new section. Both LF and CRLF line endings are accepted.
 *
 * @param {string} markdown - Raw markdown source.
 * @param {string} [contentHtml] - Kept for call-site compatibility; not read
 *   by this function.
 * @returns {Array<Object>} Sections in document order. Each has `title`,
 *   `slug`, `level` (always 2), `content` (markdown, including the H2 line),
 *   `excerpt`, `readingTime`, `technicalLevel` and `category`.
 */
function parseDocumentSections(markdown, contentHtml) {
  if (!markdown) return [];

  // Up to three spaces of indent, then a run of 3+ backticks or tildes.
  const fencePattern = /^ {0,3}(`{3,}|~{3,})(.*)$/;

  const sections = [];
  let currentSection = null;
  let sectionLines = [];
  let openFence = null; // marker string of the currently open code fence

  // Fill in the derived fields of the section being built and store it.
  const finishCurrentSection = () => {
    if (!currentSection) return;

    const content = sectionLines.join('\n').trim();
    currentSection.content = content;
    currentSection.excerpt = extractExcerpt(content);
    currentSection.readingTime = estimateReadingTime(content);
    currentSection.technicalLevel = detectTechnicalLevel(content);
    currentSection.category = categorizeSection(currentSection.title, content);
    sections.push(currentSection);
  };

  // Splitting on /\r?\n/ keeps a trailing "\r" from defeating the `$` anchor
  // of the heading patterns on CRLF documents.
  for (const line of markdown.split(/\r?\n/)) {
    const fence = line.match(fencePattern);

    if (fence) {
      const marker = fence[1];
      const rest = fence[2];

      if (openFence === null) {
        // A backtick fence may not carry backticks in its info string;
        // that form is inline code such as ```x```.
        if (marker[0] !== '`' || !rest.includes('`')) {
          openFence = marker;
        }
      } else if (
        marker[0] === openFence[0] &&
        marker.length >= openFence.length &&
        rest.trim() === ''
      ) {
        openFence = null;
      }
    }

    // Fence lines and everything between them are plain content.
    if (fence || openFence !== null) {
      if (currentSection) sectionLines.push(line);
      continue;
    }

    const h2Match = line.match(/^##\s+(.+)$/);

    if (h2Match) {
      finishCurrentSection();

      const title = h2Match[1].trim();
      currentSection = {
        title,
        slug: generateSlug(title),
        level: 2,
        content: '',
        excerpt: '',
        readingTime: 0,
        technicalLevel: 'basic',
        category: 'conceptual'
      };

      // The H2 line itself is part of the section content.
      sectionLines = [line];
      continue;
    }

    // Nothing is collected before the first H2.
    if (!currentSection) continue;

    // Skip H1 (document title) lines that appear inside a section.
    if (/^#\s+[^#]/.test(line)) continue;

    // H3+, paragraphs, lists, etc.
    sectionLines.push(line);
  }

  finishCurrentSection();

  return sections;
}

/**
 * Build a short plain-text excerpt from markdown content.
 *
 * Strips common markdown syntax (heading markers, list markers, blockquote
 * markers, bold/italic, inline code, links), keeps the first two sentences,
 * and truncates the result to 150 characters (147 plus "...").
 *
 * @param {string} content - Markdown text.
 * @returns {string} The excerpt, or '' for empty input.
 */
function extractExcerpt(content) {
  if (!content) return '';

  const text = content
    // Line-prefix markers go first, so that a "* item" bullet is not
    // mistaken for the opening star of an italic span.
    .replace(/^#+\s+/gm, '') // Heading markers
    .replace(/^[-*]\s+/gm, '') // Bullet list markers
    .replace(/^\d+\.\s+/gm, '') // Numbered list markers
    .replace(/^>\s+/gm, '') // Blockquote markers
    .replace(/\*\*(.+?)\*\*/g, '$1') // Bold
    .replace(/\*(.+?)\*/g, '$1') // Italic
    .replace(/`(.+?)`/g, '$1') // Inline code
    .replace(/\[(.+?)\]\(.+?\)/g, '$1') // Links -> link text
    .replace(/\s+/g, ' ') // Collapse newlines (LF or CRLF) and runs of spaces
    .trim();

  // A "sentence" is a run of text ending in ., ! or ?. Text with no
  // terminator at all is treated as a single sentence.
  const sentences = text.match(/[^.!?]+[.!?]+/g) || [text];

  // Every match after the first starts with the space that followed the
  // previous terminator; trim before joining to avoid double spaces.
  let excerpt = sentences
    .slice(0, 2)
    .map((sentence) => sentence.trim())
    .join(' ');

  // Truncate to 150 chars if needed
  if (excerpt.length > 150) {
    excerpt = `${excerpt.substring(0, 147)}...`;
  }

  return excerpt;
}

/**
 * Estimate reading time in whole minutes at 200 words per minute.
 *
 * Words are maximal runs of non-whitespace characters, so leading, trailing
 * and repeated whitespace do not inflate the count.
 *
 * @param {string} content - Text to measure.
 * @returns {number} Minutes, rounded up, never less than 1.
 */
function estimateReadingTime(content) {
  if (!content) return 1;

  // match() yields null when the content is whitespace only.
  const words = content.match(/\S+/g) || [];
  const minutes = Math.ceil(words.length / 200);

  return Math.max(1, minutes);
}

/**
 * Classify content as 'basic', 'intermediate' or 'advanced'.
 *
 * Two scores are computed from case-insensitive whole-word term counts:
 * a technical score (general engineering vocabulary, plus 3 points per
 * fenced code block) and an advanced score (framework-specific vocabulary).
 * - 'advanced'     when advancedScore >= 3 or technicalScore >= 15
 * - 'intermediate' when technicalScore >= 5
 * - 'basic'        otherwise, and for empty input
 *
 * @param {string} content - Markdown text.
 * @returns {string} One of 'basic', 'intermediate', 'advanced'.
 */
function detectTechnicalLevel(content) {
  if (!content) return 'basic';

  const lowerContent = content.toLowerCase();

  // Technical indicators
  const technicalTerms = [
    'api', 'database', 'mongodb', 'algorithm', 'architecture',
    'implementation', 'node.js', 'javascript', 'typescript',
    'async', 'await', 'promise', 'class', 'function',
    'middleware', 'authentication', 'authorization', 'encryption',
    'hash', 'token', 'jwt', 'rest', 'graphql'
  ];

  const advancedTerms = [
    'metacognitive', 'stochastic', 'quadrant classification',
    'intersection observer', 'csp', 'security policy',
    'cross-reference validation', 'boundary enforcement',
    'architectural constraints', 'formal verification'
  ];

  // Total whole-word occurrences of every term in the list. Terms are
  // escaped before being compiled so that the "." in "node.js" matches a
  // literal dot instead of any character.
  const countOccurrences = (terms) =>
    terms.reduce((total, term) => {
      const literal = term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
      const matches = lowerContent.match(new RegExp(`\\b${literal}\\b`, 'gi'));
      return total + (matches ? matches.length : 0);
    }, 0);

  let technicalScore = countOccurrences(technicalTerms);
  const advancedScore = countOccurrences(advancedTerms);

  // Each pair of ``` markers is one code block, worth 3 points.
  const codeBlocks = (content.match(/```/g) || []).length / 2;
  technicalScore += codeBlocks * 3;

  // Determine level
  if (advancedScore >= 3 || technicalScore >= 15) {
    return 'advanced';
  }
  if (technicalScore >= 5) {
    return 'intermediate';
  }
  return 'basic';
}

/**
 * Assign a section to one of: 'conceptual', 'technical', 'practical',
 * 'reference', 'critical'.
 *
 * The title is checked first: the first category (in the order above) that
 * has a keyword contained in the lower-cased title wins outright. If the
 * title decides nothing, each category is scored by whole-word keyword
 * occurrences in the content and the highest score wins; on a tie the
 * category listed later wins. With no keyword hits at all the result is
 * 'conceptual'.
 *
 * Missing (null/undefined) title or content is treated as an empty string.
 *
 * @param {string} title - Section title.
 * @param {string} content - Section markdown.
 * @returns {string} The category name.
 */
function categorizeSection(title, content) {
  const lowerTitle = String(title == null ? '' : title).toLowerCase();
  const lowerContent = String(content == null ? '' : content).toLowerCase();

  // Category keywords
  const categories = {
    conceptual: [
      'what is', 'introduction', 'overview', 'why', 'philosophy',
      'concept', 'theory', 'principle', 'background', 'motivation'
    ],
    technical: [
      'architecture', 'implementation', 'technical', 'code', 'api',
      'configuration', 'setup', 'installation', 'integration',
      'class', 'function', 'service', 'component'
    ],
    practical: [
      'quick start', 'tutorial', 'guide', 'how to', 'example',
      'walkthrough', 'getting started', 'usage', 'practice'
    ],
    reference: [
      'reference', 'api', 'specification', 'documentation',
      'glossary', 'terms', 'definitions', 'index'
    ],
    critical: [
      'security', 'warning', 'important', 'critical', 'boundary',
      'safety', 'risk', 'violation', 'error', 'failure'
    ]
  };

  const categoryEntries = Object.entries(categories);

  // Title pass (higher weight).
  // NOTE(review): this is a substring test, so e.g. "rapid" matches "api";
  // kept as-is to preserve existing categorisation.
  for (const [category, keywords] of categoryEntries) {
    if (keywords.some((keyword) => lowerTitle.includes(keyword))) {
      return category;
    }
  }

  // Content pass (lower weight): whole-word occurrence counts.
  let bestCategory = 'conceptual';
  let bestScore = 0;

  for (const [category, keywords] of categoryEntries) {
    let score = 0;
    for (const keyword of keywords) {
      const matches = lowerContent.match(new RegExp(`\\b${keyword}\\b`, 'gi'));
      if (matches) score += matches.length;
    }

    // ">=" lets a later category win a tie.
    if (score > 0 && score >= bestScore) {
      bestCategory = category;
      bestScore = score;
    }
  }

  return bestCategory;
}

/**
 * Generate a URL-safe slug from a title.
 *
 * Lower-cases the title, drops every character that is not an ASCII word
 * character, whitespace or hyphen, turns whitespace runs into single
 * hyphens, collapses repeated hyphens, and strips hyphens from both ends.
 *
 * @param {string} title - Heading text.
 * @returns {string} The slug; '' for empty or missing input.
 */
function generateSlug(title) {
  if (!title) return '';

  return String(title)
    .toLowerCase()
    .replace(/[^\w\s-]/g, '')
    // Trim before whitespace becomes hyphens; trimming afterwards has no effect.
    .trim()
    .replace(/\s+/g, '-')
    .replace(/-+/g, '-')
    // Hyphens left at the edges by titles such as "-- Intro --".
    .replace(/^-+|-+$/g, '');
}

// Public API of the section parser: the entry point plus the individual
// helpers it is built from.
const documentSectionParser = {
  parseDocumentSections,
  extractExcerpt,
  estimateReadingTime,
  detectTechnicalLevel,
  categorizeSection,
  generateSlug
};

module.exports = documentSectionParser;