tractatus/scripts/migrate-documents.js
TheFlow 87391ab1f2 fix: use slug from frontmatter if present in document migration
- Previously the migration script always generated slugs from titles
- Now checks metadata.slug first before auto-generating
- Allows explicit slug control via frontmatter (e.g., glossary-de, glossary-fr)
- Required for language-specific document API fallback to work correctly

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-01 10:53:05 +13:00

392 lines
12 KiB
JavaScript
Executable file

#!/usr/bin/env node
/**
* Document Migration Script
* Migrates markdown documents into the MongoDB database
*
* Usage:
* npm run migrate:docs # Interactive mode
* node scripts/migrate-documents.js --source /path/to/docs --dry-run
* node scripts/migrate-documents.js --source /path/to/docs --force
*/
require('dotenv').config();
const fs = require('fs').promises;
const path = require('path');
const { connect, close } = require('../src/utils/db.util');
const Document = require('../src/models/Document.model');
const { markdownToHtml, extractTOC, generateSlug } = require('../src/utils/markdown.util');
const logger = require('../src/utils/logger.util');
// Parse command line arguments
const args = process.argv.slice(2);
const sourceArg = args.indexOf('--source');
const dryRun = args.includes('--dry-run');
const force = args.includes('--force');
// Default source paths
const DEFAULT_SOURCES = [
'/home/theflow/projects/tractatus/docs/markdown',
'/home/theflow/projects/sydigital/stochastic/innovation-exploration/anthropic-submission'
];
/**
* Extract front matter from markdown
*/
function extractFrontMatter(content) {
const frontMatterRegex = /^---\n([\s\S]*?)\n---\n([\s\S]*)$/;
const match = content.match(frontMatterRegex);
if (!match) {
return { frontMatter: {}, content };
}
const frontMatterText = match[1];
const remainingContent = match[2];
// Parse YAML-like front matter
const frontMatter = {};
frontMatterText.split('\n').forEach(line => {
const [key, ...valueParts] = line.split(':');
if (key && valueParts.length > 0) {
const value = valueParts.join(':').trim();
frontMatter[key.trim()] = value.replace(/^["']|["']$/g, ''); // Remove quotes
}
});
return { frontMatter, content: remainingContent };
}
/**
* Extract metadata from filename and content
*/
function extractMetadata(filename, content, frontMatter) {
// Try to extract document identifier from filename
// Patterns: TRA-VAL-0001, STO-INN-0010, etc.
const identifierMatch = filename.match(/([A-Z]{3}-[A-Z]{3}-\d{4})/);
const identifier = identifierMatch ? identifierMatch[1] : null;
// Extract quadrant from identifier
let quadrant = null;
if (identifier) {
const [quad] = identifier.split('-');
const quadrantMap = {
'STR': 'strategic',
'OPS': 'operational',
'TAC': 'tactical',
'SYS': 'system',
'STO': 'stochastic'
};
quadrant = quadrantMap[quad] || null;
}
// Extract title from first H1 or front matter
let title = frontMatter.title || null;
if (!title) {
const h1Match = content.match(/^#\s+(.+)$/m);
title = h1Match ? h1Match[1] : path.basename(filename, '.md');
}
// Extract version from identifier or front matter
let version = frontMatter.version || '1.0';
if (identifier && identifier.match(/v(\d+-\d+)/)) {
version = identifier.match(/v(\d+-\d+)/)[1].replace('-', '.');
}
// Determine document type
let type = frontMatter.type || 'governance';
if (filename.includes('technical-proposal')) type = 'technical';
else if (filename.includes('appendix')) type = 'technical';
else if (filename.includes('framework')) type = 'framework';
else if (filename.includes('whitepaper')) type = 'research';
else if (filename.includes('case-stud')) type = 'case-study';
// Extract author
const author = frontMatter.author || 'System';
// Extract tags
const tags = frontMatter.tags
? frontMatter.tags.split(',').map(t => t.trim())
: [];
return {
identifier,
title,
type,
quadrant,
version,
author,
tags,
status: 'published'
};
}
/**
* Process a single markdown file
*/
async function processMarkdownFile(filePath, sourcePath) {
const filename = path.basename(filePath);
const rawContent = await fs.readFile(filePath, 'utf-8');
// Extract front matter
const { frontMatter, content } = extractFrontMatter(rawContent);
// Extract metadata
const metadata = extractMetadata(filename, content, frontMatter);
// Convert to HTML
const htmlContent = markdownToHtml(content);
// Extract table of contents
const tableOfContents = extractTOC(content);
// Use slug from frontmatter if present, otherwise generate from title
const slug = metadata.slug || generateSlug(metadata.title);
// Determine if document should be public
// Internal document patterns (should NOT be public)
const internalPatterns = [
'session-handoff',
'phase-2',
'phase-3',
'testing',
'progress-report',
'blog-post-outlines',
'cost-estimates',
'deployment-guide',
'kickoff-checklist',
'preparation-advisory',
'soft-launch',
'implementation-session',
'test-suite'
];
// Check if filename or slug matches internal patterns
const isInternal = internalPatterns.some(pattern =>
filename.toLowerCase().includes(pattern) ||
slug.toLowerCase().includes(pattern)
);
// Determine visibility from front matter or default based on patterns
let visibility = frontMatter.visibility || null;
if (!visibility) {
// Check for legacy public field
if (frontMatter.public !== undefined) {
const isPublic = frontMatter.public === true || frontMatter.public === 'true';
visibility = isPublic ? 'public' : 'internal';
} else {
// Default to internal if matches internal patterns, otherwise public
visibility = isInternal ? 'internal' : 'public';
}
}
// Validate visibility value
const validVisibility = ['public', 'internal', 'confidential', 'archived'];
if (!validVisibility.includes(visibility)) {
console.warn(`Invalid visibility '${visibility}' for ${filename}, defaulting to 'internal'`);
visibility = 'internal';
}
// Determine category from front matter or metadata type
const category = frontMatter.category || metadata.type || 'none';
// Build document object matching Document model schema
const doc = {
title: metadata.title,
slug: slug,
quadrant: metadata.quadrant,
persistence: 'HIGH', // Default for technical documents
visibility: visibility,
category: category,
content_html: htmlContent,
content_markdown: content,
toc: tableOfContents,
metadata: {
author: metadata.author,
version: metadata.version,
document_code: metadata.identifier,
tags: metadata.tags,
original_filename: filename,
source_path: path.relative(sourcePath, filePath),
migrated_at: new Date()
},
search_index: content.toLowerCase(),
translations: {},
download_formats: {}
};
return doc;
}
/**
* Find all markdown files in directory
*/
async function findMarkdownFiles(dirPath) {
const files = [];
async function scan(dir) {
const entries = await fs.readdir(dir, { withFileTypes: true });
for (const entry of entries) {
const fullPath = path.join(dir, entry.name);
if (entry.isDirectory()) {
// Skip node_modules and hidden directories
if (!entry.name.startsWith('.') && entry.name !== 'node_modules') {
await scan(fullPath);
}
} else if (entry.isFile() && entry.name.endsWith('.md')) {
// Skip README files
if (!entry.name.toLowerCase().includes('readme')) {
files.push(fullPath);
}
}
}
}
await scan(dirPath);
return files;
}
/**
* Main migration function
*/
async function migrate() {
try {
console.log('\n=== Tractatus Document Migration ===\n');
// Determine source path
let sourcePath;
if (sourceArg !== -1 && args[sourceArg + 1]) {
sourcePath = args[sourceArg + 1];
} else {
// Check default sources
for (const defaultPath of DEFAULT_SOURCES) {
try {
const stat = await fs.stat(defaultPath);
if (stat.isDirectory()) {
const files = await fs.readdir(defaultPath);
if (files.length > 0) {
sourcePath = defaultPath;
break;
}
}
} catch (err) {
// Path doesn't exist, try next
}
}
}
if (!sourcePath) {
console.error('❌ No source path specified and no documents found in default locations.');
console.log('\nUsage: npm run migrate:docs -- --source /path/to/docs');
console.log('\nDefault locations checked:');
DEFAULT_SOURCES.forEach(p => console.log(` - ${p}`));
process.exit(1);
}
console.log(`📂 Source: ${sourcePath}`);
console.log(`🔍 Mode: ${dryRun ? 'DRY RUN (no changes)' : 'MIGRATION (will write to database)'}`);
console.log('');
// Find markdown files
const markdownFiles = await findMarkdownFiles(sourcePath);
if (markdownFiles.length === 0) {
console.log('⚠️ No markdown files found.');
process.exit(0);
}
console.log(`Found ${markdownFiles.length} markdown file(s):\n`);
markdownFiles.forEach((file, i) => {
console.log(` ${i + 1}. ${path.relative(sourcePath, file)}`);
});
console.log('');
if (!dryRun) {
// Connect to database
await connect();
}
// Process each file
let createdCount = 0;
let updatedCount = 0;
let skippedCount = 0;
let errorsCount = 0;
for (const filePath of markdownFiles) {
try {
const doc = await processMarkdownFile(filePath, sourcePath);
const filename = path.basename(filePath);
if (dryRun) {
console.log(`✓ [DRY RUN] ${filename}`);
console.log(` Title: ${doc.title}`);
console.log(` Slug: ${doc.slug}`);
console.log(` Quadrant: ${doc.quadrant || 'none'}`);
console.log(` Code: ${doc.metadata.document_code || 'none'}`);
console.log('');
createdCount++;
} else {
// Check if document already exists by slug
const existing = await Document.findBySlug(doc.slug);
if (existing && !force) {
console.log(`⊘ SKIPPED ${filename} (already exists: ${existing.slug})`);
skippedCount++;
} else if (existing && force) {
// Update existing document
const updatedDoc = await Document.update(existing._id, doc);
console.log(`↻ UPDATED ${filename} (${updatedDoc.slug})`);
updatedCount++;
} else {
// Create new document
const createdDoc = await Document.create(doc);
console.log(`✓ CREATED ${filename} (${createdDoc.slug})`);
createdCount++;
}
}
} catch (error) {
console.error(`✗ ERROR processing ${path.basename(filePath)}: ${error.message}`);
logger.error(`Migration error for ${filePath}:`, error);
errorsCount++;
}
}
// Summary
console.log('\n=== Migration Summary ===\n');
console.log(` Total files: ${markdownFiles.length}`);
console.log(` Created: ${createdCount}`);
console.log(` Updated: ${updatedCount}`);
console.log(` Skipped: ${skippedCount}`);
console.log(` Errors: ${errorsCount}`);
console.log('');
if (dryRun) {
console.log('💡 This was a dry run. No changes were made.');
console.log(' Run without --dry-run to perform actual migration.');
}
if (!dryRun) {
logger.info(`Document migration completed: ${createdCount} created, ${updatedCount} updated, ${skippedCount} skipped, ${errorsCount} errors`);
}
} catch (error) {
console.error('\n❌ Migration failed:', error.message);
logger.error('Migration error:', error);
process.exit(1);
} finally {
if (!dryRun) {
await close();
}
}
}
// Run if called directly
if (require.main === module) {
migrate();
}
module.exports = migrate;