#!/usr/bin/env node
/**
 * Migrate Document Licences — Apache 2.0 → CC BY 4.0
 *
 * Updates MongoDB documents: replaces Apache 2.0 licence text in content_html
 * and content_markdown for research papers. Sets the licence field on all documents.
 *
 * Usage:
 *   node scripts/migrate-licence-to-cc-by-4.js [--dry-run] [--db <name>]
 *
 * Defaults to tractatus_dev. Use --db tractatus for production.
 */

const { MongoClient } = require('mongodb');

const DRY_RUN = process.argv.includes('--dry-run');
const dbArg = process.argv.indexOf('--db');
// May be undefined when --db is the last argument; main() validates before connecting.
const DB_NAME = dbArg !== -1 ? process.argv[dbArg + 1] : 'tractatus_dev';

// --- Classification Map ---
// Research papers → CC BY 4.0. Everything else → Apache 2.0.
// Matching is exact: a document is CC BY 4.0 only when its slug equals one of these entries.
const CC_BY_SLUGS = new Set([
  'tractatus-framework-research',
  'pluralistic-values-research-foundations',
  'the-27027-incident-a-case-study-in-pattern-recognition-bias',
  'real-world-ai-governance-a-case-study-in-framework-failure-and-recovery',
  'research-topic-concurrent-session-architecture',
  'research-topic-rule-proliferation-transactional-overhead',
  'executive-summary-tractatus-inflection-point',
  'value-pluralism-faq',
  'value-pluralism-in-tractatus-frequently-asked-questions',
  'tractatus-ai-safety-framework-core-values-and-principles',
  'organizational-theory-foundations',
  'glossary',
  'glossary-de',
  'glossary-fr',
  'business-case-tractatus-framework',
  'case-studies',
  'steering-vectors-mechanical-bias-sovereign-ai',
  'steering-vectors-and-mechanical-bias-inference-time-debiasing-for-sovereign-small-language-models',
  'taonga-centred-steering-governance-polycentric-ai',
  'taonga-centred-steering-governance-polycentric-authority-for-sovereign-small-language-models',
  'pattern-bias-from-code-to-conversation',
  'architectural-alignment-academic',
  'philosophical-foundations-village-project',
  'research-timeline',
  'architectural-safeguards-against-llm-hierarchical-dominance-prose',
  'case-studies-real-world-llm-failure-modes-appendix',
]);

/**
 * Decide whether a document should carry the CC BY 4.0 licence.
 *
 * @param {string} slug - Document slug.
 * @returns {boolean} true when the slug is listed in CC_BY_SLUGS (exact match).
 */
function shouldBeCcBy(slug) {
  return CC_BY_SLUGS.has(slug);
}

// --- Replacement strings ---
// We use simple string search-and-replace. More reliable than regex on messy HTML.
const APACHE_STRINGS_TO_FIND = {
  en: [
    // Full licence block text (the body, not the heading)
    'Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.',
    'Licensed under the Apache License, Version 2.0 (the "License");\nyou may not use this file except in compliance with the License.',
    // Inline metadata variants
    '**License:** Apache License 2.0',
    // NOTE(review): the plain 'License: …' entry was listed twice; one copy is
    // presumed to be the HTML-bold variant below — confirm against real content.
    '<strong>License:</strong> Apache License 2.0',
    'License: Apache License 2.0',
    '*License: Apache License 2.0*',
    // Summary items
    'Apache License, Version 2.0, January 2004',
  ],
  de: [
    // Full German single-line block (as found in glossary-de)
    'Lizenziert unter der Apache License, Version 2.0 (die "Lizenz"); Sie d\u00fcrfen diese Datei nur in \u00dcbereinstimmung mit der Lizenz verwenden.',
    // Shorter variant
    'Lizenziert unter der Apache License, Version 2.0',
    'lizenziert unter der Apache License, Version 2.0',
    // Inline German metadata
    'Apache-Lizenz 2.0',
  ],
  fr: [
    'Sous licence Apache License, Version 2.0',
    'sous licence Apache License, Version 2.0',
    'Licencié sous la Licence Apache, Version 2.0',
    'Licence Apache 2.0',
    // French typography (space before colon)
    // NOTE(review): presumed literal — a bare 'Apache License 2.0' needle would also
    // delete that phrase from the dual-licence note on re-runs. Confirm the intended string.
    '<strong>Licence :</strong> Apache License 2.0',
  ],
  mi: [
    'I raro i te Rāngai Apache, Putanga 2.0',
  ],
};

// Needles from every language; main document bodies sometimes mix languages.
const ALL_NEEDLES = [
  ...APACHE_STRINGS_TO_FIND.en,
  ...APACHE_STRINGS_TO_FIND.de,
  ...APACHE_STRINGS_TO_FIND.fr,
  ...APACHE_STRINGS_TO_FIND.mi,
];

// --- CC BY 4.0 licence text inserted under the licence heading ---
// NOTE(review): the <a>/<strong> markup mirrors the Markdown variant below; confirm it
// matches the markup the site expects for licence blocks.
const CC_BY_HTML = `\n<p>Copyright \u00a9 2026 John Stroh.</p>\n<p>This work is licensed under the <a href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution 4.0 International Licence (CC BY 4.0)</a>.</p>\n<p>You are free to share, copy, redistribute, adapt, remix, transform, and build upon this material for any purpose, including commercially, provided you give appropriate attribution, provide a link to the licence, and indicate if changes were made.</p>\n<p><strong>Note:</strong> The Tractatus AI Safety Framework source code is separately licensed under the Apache License 2.0. This Creative Commons licence applies to the research paper text and figures only.</p>`;

const CC_BY_MD = `\n\nCopyright \u00a9 2026 John Stroh.\n\nThis work is licensed under the [Creative Commons Attribution 4.0 International Licence (CC BY 4.0)](https://creativecommons.org/licenses/by/4.0/).\n\nYou are free to share, copy, redistribute, adapt, remix, transform, and build upon this material for any purpose, including commercially, provided you give appropriate attribution, provide a link to the licence, and indicate if changes were made.\n\n**Note:** The Tractatus AI Safety Framework source code is separately licensed under the Apache License 2.0. This Creative Commons licence applies to the research paper text and figures only.\n`;

// Heading normalisation for main-document HTML. The heading may carry attributes
// (e.g. <h2 id="license">); [\s\S]*? lets the German/French patterns match through
// inner HTML elements. Order matters: "License" is rewritten to "Licence" first.
const HTML_HEADING_RULES = [
  [/<h2[^>]*>(?:Document )?License<\/h2>/i, '<h2>Licence</h2>'],
  [/<h2[^>]*>Lizenz[\s\S]*?<\/h2>/i, '<h2>Lizenz</h2>'],
  [/<h2[^>]*>Licence[\s\S]*?<\/h2>/i, '<h2>Licence</h2>'],
];

// Heading normalisation for translation HTML (plain <h2> only, text-only content).
const TRANSLATION_HEADING_RULES = [
  [/<h2>Lizenz[^<]*<\/h2>/, '<h2>Lizenz</h2>'],
  [/<h2>Licence[^<]*<\/h2>/, '<h2>Licence</h2>'],
  [/<h2>License<\/h2>/, '<h2>Licence</h2>'],
  [/<h2>R\u0101ngai[^<]*<\/h2>/, '<h2>R\u0101ngai</h2>'],
];

// Normalised headings the CC BY text is inserted under, in order of preference.
const HTML_ANCHORS = ['<h2>Licence</h2>', '<h2>Lizenz</h2>'];
const MD_ANCHORS = ['## Licence', '## Lizenz'];

/**
 * Report whether text still mentions the Apache licence once the acceptable
 * dual-licence notes have been discounted.
 *
 * @param {string} text - Content to inspect (after replacement).
 * @returns {boolean} true when 'Apache License' or 'Apache-Lizenz' remains.
 */
function hasStrayApache(text) {
  if (!text) return false;
  // Remove the acceptable dual-licence note (various language forms)
  const cleaned = text
    .replace(/separately licensed under the Apache License 2\.0/g, '')
    .replace(/separat unter der Apache License 2\.0 lizenziert/g, '')
    .replace(/séparément sous la Licence Apache 2\.0/g, '')
    .replace(/Apache License 2\.0\. This Creative Commons/g, '')
    .replace(/Apache License 2\.0\. Diese Creative-Commons/g, '')
    .replace(/Apache License 2\.0\. Cette licence Creative/g, '')
    // Also acceptable: the framework code reference in any context
    .replace(/source code is separately licensed under the Apache/g, '')
    .replace(/Quellcode.*?Apache License 2\.0/g, '')
    // Māori dual-licence note
    .replace(/kei raro anō i te Apache License 2\.0/g, '');
  return cleaned.includes('Apache License') || cleaned.includes('Apache-Lizenz');
}

/** True when the text already carries CC BY 4.0 wording (e.g. from a previous run). */
function hasCcByText(text) {
  return text.includes('Creative Commons') || text.includes('CC BY 4.0');
}

/**
 * Remove every occurrence of each needle from text.
 * Needles are de-duplicated and applied longest-first so that a short needle
 * (e.g. 'License: Apache License 2.0') cannot destroy a longer needle that
 * contains it (e.g. '*License: Apache License 2.0*') before it gets a chance to match.
 */
function stripNeedles(text, needles) {
  const ordered = [...new Set(needles)].sort((a, b) => b.length - a.length);
  let result = text;
  for (const needle of ordered) {
    if (result.includes(needle)) {
      result = result.split(needle).join('');
    }
  }
  return result;
}

/** Apply [pattern, replacement] heading rules in order; each rewrites the first match. */
function normaliseHeadings(html, rules) {
  let result = html;
  for (const [pattern, replacement] of rules) {
    result = result.replace(pattern, replacement);
  }
  return result;
}

/**
 * Produce the migrated HTML body of a CC BY document: Apache text removed,
 * headings normalised, CC BY text present.
 */
function migrateHtml(original) {
  let html = normaliseHeadings(stripNeedles(original, ALL_NEEDLES), HTML_HEADING_RULES);
  if (hasCcByText(html)) return html;

  for (const anchor of HTML_ANCHORS) {
    const at = html.indexOf(anchor);
    if (at >= 0) {
      const cut = at + anchor.length;
      return html.substring(0, cut) + CC_BY_HTML + html.substring(cut);
    }
  }
  // No licence heading to anchor to (whether or not Apache text was removed):
  // append a licence section so the document never ends up without one.
  html = html.trimEnd() + '\n<h2>Licence</h2>' + CC_BY_HTML + '\n';
  return html;
}

/**
 * Produce the migrated Markdown body of a CC BY document: Apache text removed,
 * headings normalised, CC BY text present.
 */
function migrateMarkdown(original) {
  let md = stripNeedles(original, ALL_NEEDLES);
  if (md.includes('## License') || md.includes('## Document License') || md.includes('## Lizenz')) {
    md = md.replace(/## (?:Document )?License/, '## Licence');
    md = md.replace(/## Lizenz(?:\s+Copyright)/, '## Licence\n\nCopyright');
  }
  if (hasCcByText(md)) return md;

  for (const heading of MD_ANCHORS) {
    const at = md.indexOf(`${heading}\n`);
    if (at >= 0) {
      // The newline after the heading is replaced by the licence text,
      // which supplies its own leading and trailing newlines.
      return md.substring(0, at) + heading + CC_BY_MD + md.substring(at + heading.length + 1);
    }
  }
  // No usable heading: append a licence section instead of leaving the document bare.
  return md.trimEnd() + '\n\n## Licence' + CC_BY_MD;
}

/**
 * Work out what has to change for one document.
 *
 * @param {object} doc - Document as read from the `documents` collection.
 * @returns {{licence: string, isCcBy: boolean, updates: object, changes: string[], warnings: number}}
 *   `updates` is a $set payload containing only fields whose value actually differs.
 */
function planDocumentUpdate(doc) {
  const isCcBy = shouldBeCcBy(doc.slug);
  const licence = isCcBy ? 'CC-BY-4.0' : 'Apache-2.0';
  const updates = {};
  const changes = [];
  let warnings = 0;

  // Skip the write when the field is already correct so re-runs are no-ops.
  if (doc.licence !== licence) {
    updates.licence = licence;
  }

  if (!isCcBy) {
    return { licence, isCcBy, updates, changes, warnings };
  }

  // Process content_html
  if (doc.content_html) {
    const html = migrateHtml(doc.content_html);
    if (html !== doc.content_html) {
      updates.content_html = html;
      changes.push('content_html');
    }
    // Check for remaining Apache references AFTER all replacements
    if (hasStrayApache(html)) {
      changes.push('content_html:WARNING_STRAY_APACHE');
      warnings++;
    }
  }

  // Process content_markdown
  if (doc.content_markdown) {
    const md = migrateMarkdown(doc.content_markdown);
    if (md !== doc.content_markdown) {
      updates.content_markdown = md;
      changes.push('content_markdown');
    }
    // Check AFTER all replacements
    if (hasStrayApache(md)) {
      changes.push('content_markdown:WARNING_STRAY_APACHE');
      warnings++;
    }
  }

  // Process translations
  if (doc.translations) {
    for (const [lang, translation] of Object.entries(doc.translations)) {
      // Tolerate null/empty translation entries.
      if (!translation || !translation.content_html) continue;

      // Apply both language-specific and English needles (some translations mix)
      const needles = [
        ...(APACHE_STRINGS_TO_FIND[lang] || APACHE_STRINGS_TO_FIND.en),
        ...APACHE_STRINGS_TO_FIND.en,
      ];
      let html = stripNeedles(translation.content_html, needles);
      if (html !== translation.content_html) {
        // Headings are only normalised when Apache text was actually removed.
        html = normaliseHeadings(html, TRANSLATION_HEADING_RULES);
        updates[`translations.${lang}.content_html`] = html;
        changes.push(`translations.${lang}.content_html`);
      }
      if (hasStrayApache(html)) {
        changes.push(`translations.${lang}:WARNING_STRAY_APACHE`);
        warnings++;
      }
    }
  }

  return { licence, isCcBy, updates, changes, warnings };
}

/**
 * Entry point: connect to the local MongoDB, plan and (unless --dry-run) apply
 * licence updates to every document in the `documents` collection, then print a summary.
 *
 * @throws {Error} when --db is given without a database name.
 */
async function main() {
  // `--db` as the last argument (or followed by another flag) would otherwise
  // pass undefined/garbage to client.db() and silently target the wrong database.
  if (typeof DB_NAME !== 'string' || DB_NAME === '' || DB_NAME.startsWith('--')) {
    throw new Error('--db requires a database name, e.g. --db tractatus');
  }

  console.log(`\n=== Licence Migration: Apache 2.0 → CC BY 4.0 ===`);
  console.log(`Database: ${DB_NAME}`);
  console.log(`Mode: ${DRY_RUN ? 'DRY RUN' : 'LIVE'}\n`);

  const client = new MongoClient('mongodb://localhost:27017');

  try {
    await client.connect();
    const db = client.db(DB_NAME);
    const collection = db.collection('documents');

    const documents = await collection.find({}).toArray();
    console.log(`Found ${documents.length} documents in database\n`);

    let updated = 0;
    let warnings = 0;

    for (const doc of documents) {
      const plan = planDocumentUpdate(doc);
      const { licence, isCcBy, updates, changes } = plan;
      warnings += plan.warnings;

      // Only log if there are actual changes or it's a CC BY doc
      if (changes.length > 0 || isCcBy) {
        const status = changes.length > 0
          ? changes.join(', ')
          : (isCcBy ? 'already correct or no licence block' : '');
        console.log(`[${doc.slug}] → ${licence} ${status ? '| ' + status : ''}`);
      }

      if (Object.keys(updates).length === 0) continue;

      if (!DRY_RUN) {
        await collection.updateOne({ _id: doc._id }, { $set: updates });
      }
      // In dry-run mode this counts documents that would be updated.
      updated++;
    }

    console.log(`\n--- Summary ---`);
    console.log(`Total documents: ${documents.length}`);
    console.log(`Updated: ${updated}`);
    console.log(`Warnings (stray Apache text): ${warnings}`);

    if (warnings > 0) {
      console.log('\nWARNING: Some documents still contain Apache text after replacement.');
      console.log('These may need manual review — the text may be in an unusual format.');
    }

    if (DRY_RUN) {
      console.log('\nRe-run without --dry-run to apply changes.');
    }
  } finally {
    await client.close();
  }
}

main().catch(err => {
  console.error('Fatal error:', err);
  process.exit(1);
});