#!/usr/bin/env node

/**
 * Document Migration Script
 * Migrates markdown documents into the MongoDB database
 *
 * Usage:
 *   npm run migrate:docs    # No --source: first non-empty default location is used
 *   node scripts/migrate-documents.js --source /path/to/docs --dry-run
 *   node scripts/migrate-documents.js --source /path/to/docs --force
 *
 * Flags:
 *   --source <dir>  Directory scanned recursively for *.md files
 *   --dry-run       Parse and report only; never connects to the database
 *   --force         Update documents whose slug already exists instead of skipping
 */

require('dotenv').config();

const fs = require('fs').promises;
const path = require('path');
const { connect, close } = require('../src/utils/db.util');
const Document = require('../src/models/Document.model');
const { markdownToHtml, extractTOC, generateSlug } = require('../src/utils/markdown.util');
const logger = require('../src/utils/logger.util');

// Parse command line arguments
const args = process.argv.slice(2);
const sourceArg = args.indexOf('--source');
const dryRun = args.includes('--dry-run');
const force = args.includes('--force');

// Default source paths, tried in order when --source is not given
const DEFAULT_SOURCES = [
  '/home/theflow/projects/tractatus/docs/markdown',
  '/home/theflow/projects/sydigital/stochastic/innovation-exploration/anthropic-submission'
];

// First segment of a document identifier (e.g. "STO" in STO-INN-0010) -> quadrant
const QUADRANT_BY_PREFIX = {
  'STR': 'strategic',
  'OPS': 'operational',
  'TAC': 'tactical',
  'SYS': 'system',
  'STO': 'stochastic'
};

// Filename fragment -> document type. Order matters: the first match wins.
const TYPE_BY_FILENAME_FRAGMENT = [
  ['technical-proposal', 'technical'],
  ['appendix', 'technical'],
  ['framework', 'framework'],
  ['whitepaper', 'research'],
  ['case-stud', 'case-study']
];

// Internal document patterns (should NOT be public)
const INTERNAL_PATTERNS = [
  'session-handoff',
  'phase-2',
  'phase-3',
  'testing',
  'progress-report',
  'blog-post-outlines',
  'cost-estimates',
  'deployment-guide',
  'kickoff-checklist',
  'preparation-advisory',
  'soft-launch',
  'implementation-session',
  'test-suite'
];

const VALID_VISIBILITY = ['public', 'internal', 'confidential', 'archived'];

/**
 * Extract front matter from markdown.
 *
 * Only a flat "key: value" subset of YAML is understood: each line is split
 * at its first colon, and one leading/trailing quote is stripped from the value.
 * Every value is returned as a string.
 *
 * @param {string} content - Raw markdown, optionally starting with a `---` block.
 * @returns {{frontMatter: Object<string, string>, content: string}} Parsed
 *   key/value pairs and the markdown that follows the front matter. When no
 *   front matter block is found, `frontMatter` is empty and `content` is the input.
 */
function extractFrontMatter(content) {
  // \r?\n so that files saved with Windows (CRLF) line endings are recognised too
  const frontMatterRegex = /^---\r?\n([\s\S]*?)\r?\n---\r?\n([\s\S]*)$/;
  const match = content.match(frontMatterRegex);

  if (!match) {
    return { frontMatter: {}, content };
  }

  const frontMatterText = match[1];
  const remainingContent = match[2];

  // Parse YAML-like front matter
  const frontMatter = {};
  frontMatterText.split(/\r?\n/).forEach(line => {
    const separatorIndex = line.indexOf(':');
    if (separatorIndex === -1) {
      return;
    }

    const key = line.slice(0, separatorIndex).trim();
    if (!key) {
      return;
    }

    // Everything after the first colon is the value, so values may contain colons
    const value = line.slice(separatorIndex + 1).trim();
    frontMatter[key] = value.replace(/^["']|["']$/g, ''); // Remove quotes
  });

  return { frontMatter, content: remainingContent };
}

/**
 * Extract metadata from filename and content.
 *
 * @param {string} filename - Base name of the markdown file.
 * @param {string} content - Markdown body (front matter already removed).
 * @param {Object<string, string>} frontMatter - Parsed front matter values.
 * @returns {{identifier: ?string, title: string, slug: ?string, type: string,
 *   quadrant: ?string, version: string, author: string, tags: string[],
 *   status: string}} Derived document metadata.
 */
function extractMetadata(filename, content, frontMatter) {
  // Try to extract document identifier from filename
  // Patterns: TRA-VAL-0001, STO-INN-0010, etc.
  const identifierMatch = filename.match(/([A-Z]{3}-[A-Z]{3}-\d{4})/);
  const identifier = identifierMatch ? identifierMatch[1] : null;

  // Extract quadrant from identifier
  let quadrant = null;
  if (identifier) {
    const [quad] = identifier.split('-');
    quadrant = QUADRANT_BY_PREFIX[quad] || null;
  }

  // Extract title from front matter or first H1, falling back to the filename
  let title = frontMatter.title || null;
  if (!title) {
    const h1Match = content.match(/^#\s+(.+)$/m);
    // trim() drops the trailing "\r" that "." captures in CRLF files
    title = h1Match ? h1Match[1].trim() : path.basename(filename, '.md');
  }

  // Version: front matter wins; otherwise look for a "v1-2" style marker in the
  // filename. The identifier itself (AAA-BBB-0000) can never contain one.
  let version = frontMatter.version || null;
  if (!version) {
    const versionMatch = filename.match(/(?:^|[^a-z0-9])v(\d+)-(\d+)/i);
    version = versionMatch ? `${versionMatch[1]}.${versionMatch[2]}` : '1.0';
  }

  // Determine document type; a matching filename fragment overrides front matter
  let type = frontMatter.type || 'governance';
  const typeRule = TYPE_BY_FILENAME_FRAGMENT.find(([fragment]) => filename.includes(fragment));
  if (typeRule) {
    type = typeRule[1];
  }

  // Extract author
  const author = frontMatter.author || 'System';

  // Extract tags (comma-separated); empty entries from stray commas are dropped
  const tags = frontMatter.tags
    ? frontMatter.tags.split(',').map(t => t.trim()).filter(Boolean)
    : [];

  return {
    identifier,
    title,
    // Explicit slug from front matter; null means "generate one from the title"
    slug: frontMatter.slug || null,
    type,
    quadrant,
    version,
    author,
    tags,
    status: 'published'
  };
}

/**
 * Process a single markdown file into a document object.
 *
 * @param {string} filePath - Path of the markdown file to read.
 * @param {string} sourcePath - Migration root; used to record a relative source path.
 * @returns {Promise<Object>} Document object ready to be passed to the Document model.
 */
async function processMarkdownFile(filePath, sourcePath) {
  const filename = path.basename(filePath);
  const rawContent = await fs.readFile(filePath, 'utf-8');

  // Extract front matter
  const { frontMatter, content } = extractFrontMatter(rawContent);

  // Extract metadata
  const metadata = extractMetadata(filename, content, frontMatter);

  // Convert to HTML
  const htmlContent = markdownToHtml(content);

  // Extract table of contents
  const tableOfContents = extractTOC(content);

  // Use slug from frontmatter if present, otherwise generate from title
  const slug = metadata.slug || generateSlug(metadata.title);

  // Check if filename or slug matches internal patterns
  const lowerFilename = filename.toLowerCase();
  const lowerSlug = slug.toLowerCase();
  const isInternal = INTERNAL_PATTERNS.some(pattern =>
    lowerFilename.includes(pattern) || lowerSlug.includes(pattern)
  );

  // Determine visibility from front matter or default based on patterns
  let visibility = frontMatter.visibility || null;
  if (!visibility) {
    // Check for legacy public field
    if (frontMatter.public !== undefined) {
      const isPublic = frontMatter.public === true || frontMatter.public === 'true';
      visibility = isPublic ? 'public' : 'internal';
    } else {
      // Default to internal if matches internal patterns, otherwise public
      visibility = isInternal ? 'internal' : 'public';
    }
  }

  // Validate visibility value; unknown values fall back to the restrictive option
  if (!VALID_VISIBILITY.includes(visibility)) {
    console.warn(`Invalid visibility '${visibility}' for ${filename}, defaulting to 'internal'`);
    visibility = 'internal';
  }

  // Determine category from front matter or metadata type
  const category = frontMatter.category || metadata.type || 'none';

  // Build document object matching Document model schema
  const doc = {
    title: metadata.title,
    slug: slug,
    quadrant: metadata.quadrant,
    persistence: 'HIGH', // Default for technical documents
    visibility: visibility,
    category: category,
    content_html: htmlContent,
    content_markdown: content,
    toc: tableOfContents,
    metadata: {
      author: metadata.author,
      version: metadata.version,
      document_code: metadata.identifier,
      tags: metadata.tags,
      original_filename: filename,
      source_path: path.relative(sourcePath, filePath),
      migrated_at: new Date()
    },
    search_index: content.toLowerCase(),
    translations: {},
    download_formats: {}
  };

  return doc;
}

/**
 * Find all markdown files in a directory tree.
 *
 * Hidden directories, `node_modules`, and any file whose name contains
 * "readme" (case-insensitive) are skipped.
 *
 * @param {string} dirPath - Root directory to scan.
 * @returns {Promise<string[]>} Paths of the `.md` files found.
 */
async function findMarkdownFiles(dirPath) {
  const files = [];

  async function scan(dir) {
    const entries = await fs.readdir(dir, { withFileTypes: true });

    for (const entry of entries) {
      const fullPath = path.join(dir, entry.name);

      if (entry.isDirectory()) {
        // Skip node_modules and hidden directories
        if (!entry.name.startsWith('.') && entry.name !== 'node_modules') {
          await scan(fullPath);
        }
      } else if (entry.isFile() && entry.name.endsWith('.md')) {
        // Skip README files
        if (!entry.name.toLowerCase().includes('readme')) {
          files.push(fullPath);
        }
      }
    }
  }

  await scan(dirPath);
  return files;
}

/**
 * Resolve the directory to migrate from.
 *
 * Uses the value following `--source` when one is given; otherwise returns the
 * first entry of DEFAULT_SOURCES that is an existing, non-empty directory.
 *
 * @returns {Promise<string|undefined>} The source directory, or undefined if none qualifies.
 */
async function resolveSourcePath() {
  const explicitSource = sourceArg !== -1 ? args[sourceArg + 1] : undefined;
  // A following "--flag" means the value for --source was omitted
  if (explicitSource && !explicitSource.startsWith('--')) {
    return explicitSource;
  }

  // Check default sources
  for (const defaultPath of DEFAULT_SOURCES) {
    try {
      const stat = await fs.stat(defaultPath);
      if (stat.isDirectory()) {
        const files = await fs.readdir(defaultPath);
        if (files.length > 0) {
          return defaultPath;
        }
      }
    } catch (err) {
      // Path doesn't exist (or is unreadable), try next
    }
  }

  return undefined;
}

/**
 * Main migration function.
 *
 * Scans the source directory, converts every markdown file and, unless
 * `--dry-run` is set, creates or (with `--force`) updates the matching
 * database documents. Per-file failures are counted and logged without
 * aborting the run.
 *
 * Exits the process with code 1 when no source can be resolved or the run
 * fails, and with code 0 when no markdown files are found. The exit is
 * deferred until the database connection has been closed.
 *
 * @returns {Promise<void>}
 */
async function migrate() {
  let connected = false;
  // null = leave the process running (normal completion); a number = exit with it
  let exitCode = null;

  try {
    console.log('\n=== Tractatus Document Migration ===\n');

    // Determine source path
    const sourcePath = await resolveSourcePath();

    if (!sourcePath) {
      console.error('❌ No source path specified and no documents found in default locations.');
      console.log('\nUsage: npm run migrate:docs -- --source /path/to/docs');
      console.log('\nDefault locations checked:');
      DEFAULT_SOURCES.forEach(p => console.log(` - ${p}`));
      exitCode = 1;
      return;
    }

    console.log(`📂 Source: ${sourcePath}`);
    console.log(`🔍 Mode: ${dryRun ? 'DRY RUN (no changes)' : 'MIGRATION (will write to database)'}`);
    console.log('');

    // Find markdown files
    const markdownFiles = await findMarkdownFiles(sourcePath);

    if (markdownFiles.length === 0) {
      console.log('⚠️ No markdown files found.');
      exitCode = 0;
      return;
    }

    console.log(`Found ${markdownFiles.length} markdown file(s):\n`);
    markdownFiles.forEach((file, i) => {
      console.log(` ${i + 1}. ${path.relative(sourcePath, file)}`);
    });
    console.log('');

    if (!dryRun) {
      // Connect to database
      await connect();
      connected = true;
    }

    // Process each file
    let createdCount = 0;
    let updatedCount = 0;
    let skippedCount = 0;
    let errorsCount = 0;

    for (const filePath of markdownFiles) {
      try {
        const doc = await processMarkdownFile(filePath, sourcePath);
        const filename = path.basename(filePath);

        if (dryRun) {
          console.log(`✓ [DRY RUN] ${filename}`);
          console.log(` Title: ${doc.title}`);
          console.log(` Slug: ${doc.slug}`);
          console.log(` Quadrant: ${doc.quadrant || 'none'}`);
          console.log(` Code: ${doc.metadata.document_code || 'none'}`);
          console.log('');
          createdCount++;
        } else {
          // Check if document already exists by slug
          const existing = await Document.findBySlug(doc.slug);

          if (existing && !force) {
            console.log(`⊘ SKIPPED ${filename} (already exists: ${existing.slug})`);
            skippedCount++;
          } else if (existing && force) {
            // Update existing document
            const updatedDoc = await Document.update(existing._id, doc);
            console.log(`↻ UPDATED ${filename} (${updatedDoc.slug})`);
            updatedCount++;
          } else {
            // Create new document
            const createdDoc = await Document.create(doc);
            console.log(`✓ CREATED ${filename} (${createdDoc.slug})`);
            createdCount++;
          }
        }
      } catch (error) {
        // One bad file must not abort the whole run
        console.error(`✗ ERROR processing ${path.basename(filePath)}: ${error.message}`);
        logger.error(`Migration error for ${filePath}:`, error);
        errorsCount++;
      }
    }

    // Summary
    console.log('\n=== Migration Summary ===\n');
    console.log(` Total files: ${markdownFiles.length}`);
    console.log(` Created: ${createdCount}`);
    console.log(` Updated: ${updatedCount}`);
    console.log(` Skipped: ${skippedCount}`);
    console.log(` Errors: ${errorsCount}`);
    console.log('');

    if (dryRun) {
      console.log('💡 This was a dry run. No changes were made.');
      console.log(' Run without --dry-run to perform actual migration.');
    }

    if (!dryRun) {
      logger.info(`Document migration completed: ${createdCount} created, ${updatedCount} updated, ${skippedCount} skipped, ${errorsCount} errors`);
    }
  } catch (error) {
    console.error('\n❌ Migration failed:', error.message);
    logger.error('Migration error:', error);
    exitCode = 1;
  } finally {
    // Close only a connection that was actually opened, and do it before any
    // process.exit() so the exit cannot skip the cleanup.
    if (connected) {
      try {
        await close();
      } catch (closeError) {
        logger.error('Failed to close database connection:', closeError);
        if (exitCode === null) {
          exitCode = 1;
        }
      }
    }

    if (exitCode !== null) {
      process.exit(exitCode);
    }
  }
}

// Run if called directly
if (require.main === module) {
  migrate().catch(error => {
    console.error('\n❌ Migration failed:', error.message);
    process.exit(1);
  });
}

module.exports = migrate;