tractatus/scripts/migrate-licence-to-cc-by-4.js
TheFlow f1544e2b42 docs: Add incident report and licence migration utility scripts
- INCIDENT_REPORT_20260222: Deliberate instruction refusal analysis
- fix-markdown-licences.js: Batch update licence sections in markdown
- migrate-licence-to-cc-by-4.js: Apache 2.0 → CC BY 4.0 migration tool
- publish-overtrust-blog-post.js: Blog post publishing utility
- validate-licences.js: Licence compliance checker

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-22 16:36:41 +13:00

316 lines
14 KiB
JavaScript

#!/usr/bin/env node
/**
* Migrate Document Licences — Apache 2.0 → CC BY 4.0
*
* Updates MongoDB documents: replaces Apache 2.0 licence text in content_html
* and content_markdown for research papers. Sets the licence field on all documents.
*
* Usage:
* node scripts/migrate-licence-to-cc-by-4.js [--dry-run] [--db <name>]
*
* Defaults to tractatus_dev. Use --db tractatus for production.
*/
const { MongoClient } = require('mongodb');
/**
 * Resolve the target database name from a process-style argument vector.
 *
 * @param {string[]} argv - Argument list to scan (normally `process.argv`).
 * @param {string} [fallback='tractatus_dev'] - Name used when `--db` is absent.
 * @returns {string} The value following `--db`, or `fallback` when the flag is absent.
 * @throws {Error} When `--db` is present but is not followed by a usable name
 *   (end of arguments, empty string, or another `-`-prefixed flag).
 */
function resolveDbName(argv, fallback = 'tractatus_dev') {
  const flagIndex = argv.indexOf('--db');
  if (flagIndex === -1) {
    return fallback;
  }
  const value = argv[flagIndex + 1];
  // Without this guard `--db` alone yields undefined and `--db --dry-run` yields
  // "--dry-run"; either would be passed straight to client.db() and a LIVE run
  // could then write to a database the operator never named.
  if (typeof value !== 'string' || value === '' || value.startsWith('-')) {
    throw new Error('--db requires a database name, e.g. --db tractatus');
  }
  return value;
}

// True when the script should only report what it would change.
const DRY_RUN = process.argv.includes('--dry-run');
// Position of the --db flag in process.argv (-1 when absent); kept as a module-level name.
const dbArg = process.argv.indexOf('--db');
// Target database; defaults to the development database.
const DB_NAME = resolveDbName(process.argv);
// --- Classification Map ---
// Documents whose slug is listed here are licensed CC BY 4.0; every other
// document is labelled Apache 2.0.
// Matching is exact: shouldBeCcBy() tests Set membership, so a slug must equal
// one of these entries in full (a substring match is not enough).
const CC_BY_SLUGS = new Set([
  'tractatus-framework-research',
  'pluralistic-values-research-foundations',
  'the-27027-incident-a-case-study-in-pattern-recognition-bias',
  'real-world-ai-governance-a-case-study-in-framework-failure-and-recovery',
  'research-topic-concurrent-session-architecture',
  'research-topic-rule-proliferation-transactional-overhead',
  'executive-summary-tractatus-inflection-point',
  'value-pluralism-faq',
  'value-pluralism-in-tractatus-frequently-asked-questions',
  'tractatus-ai-safety-framework-core-values-and-principles',
  'organizational-theory-foundations',
  'glossary',
  'glossary-de',
  'glossary-fr',
  'business-case-tractatus-framework',
  'case-studies',
  'steering-vectors-mechanical-bias-sovereign-ai',
  'steering-vectors-and-mechanical-bias-inference-time-debiasing-for-sovereign-small-language-models',
  'taonga-centred-steering-governance-polycentric-ai',
  'taonga-centred-steering-governance-polycentric-authority-for-sovereign-small-language-models',
  'pattern-bias-from-code-to-conversation',
  'architectural-alignment-academic',
  'philosophical-foundations-village-project',
  'research-timeline',
  'architectural-safeguards-against-llm-hierarchical-dominance-prose',
  'case-studies-real-world-llm-failure-modes-appendix',
]);
/**
 * Decide whether a document belongs to the CC BY 4.0 group.
 *
 * @param {string} slug - Document slug to classify.
 * @returns {boolean} True only when `slug` is an exact member of CC_BY_SLUGS.
 */
function shouldBeCcBy(slug) {
  const isListed = CC_BY_SLUGS.has(slug);
  return isListed;
}
// --- Replacement strings ---
// Literal Apache 2.0 phrases ("needles"), grouped by language code. main()
// deletes every occurrence of a needle with a plain substring split/join
// rather than a regular expression, and walks each list in array order.
const APACHE_STRINGS_TO_FIND = {
  en: [
    // Body of the licence block (heading excluded): one-line and line-wrapped forms.
    'Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.',
    'Licensed under the Apache License, Version 2.0 (the "License");\nyou may not use this file except in compliance with the License.',
    // Inline metadata in Markdown and HTML spellings.
    '**License:** Apache License 2.0',
    '<strong>License:</strong> Apache License 2.0',
    '<em>License: Apache License 2.0</em>',
    '*License: Apache License 2.0*',
    // Summary line.
    'Apache License, Version 2.0, January 2004',
  ],
  de: [
    // Complete single-line sentence (as found in glossary-de). It precedes the
    // shorter variants below, which are prefixes of it.
    'Lizenziert unter der Apache License, Version 2.0 (die "Lizenz"); Sie d\u00fcrfen diese Datei nur in \u00dcbereinstimmung mit der Lizenz verwenden.',
    // Shorter variants, capitalised and lower-case.
    'Lizenziert unter der Apache License, Version 2.0',
    'lizenziert unter der Apache License, Version 2.0',
    // Inline German metadata.
    'Apache-Lizenz 2.0',
  ],
  fr: [
    'Sous licence Apache License, Version 2.0',
    'sous licence Apache License, Version 2.0',
    'Licencié sous la Licence Apache, Version 2.0',
    'Licence Apache 2.0',
    // List-item tail. NOTE(review): this needle includes the closing </li>, so
    // deleting it also deletes that tag — confirm this is intended.
    'Apache License 2.0</li>',
  ],
  mi: [
    'I raro i te Rāngai Apache, Putanga 2.0',
  ],
};
/**
 * Report whether `text` still mentions the Apache licence after migration.
 *
 * The acceptable dual-licence notes (the statement that the framework source
 * code remains under Apache 2.0) are discounted first; anything left that
 * names the Apache licence counts as stray.
 *
 * @param {string|null|undefined} text - Content to inspect.
 * @returns {boolean} True when a non-acceptable Apache mention remains;
 *   false for empty or missing text.
 */
function hasStrayApache(text) {
  if (!text) return false;

  // Applied in this order; each pattern deletes one acceptable phrasing.
  const acceptableMentions = [
    // Dual-licence note (English, German, French)
    /separately licensed under the Apache License 2\.0/g,
    /separat unter der Apache License 2\.0 lizenziert/g,
    /séparément sous la Licence Apache 2\.0/g,
    /Apache License 2\.0\. This Creative Commons/g,
    /Apache License 2\.0\. Diese Creative-Commons/g,
    /Apache License 2\.0\. Cette licence Creative/g,
    // Framework source-code reference in any context
    /source code is separately licensed under the Apache/g,
    /Quellcode.*?Apache License 2\.0/g,
    // Māori dual-licence note
    /kei raro anō i te Apache License 2\.0/g,
  ];
  const cleaned = acceptableMentions.reduce(
    (remaining, pattern) => remaining.replace(pattern, ''),
    text,
  );

  // 'Licence Apache' is the French word order. Without it French stray text
  // was never reported, and the French allow-list entry above had no effect
  // on the result.
  const strayMarkers = ['Apache License', 'Apache-Lizenz', 'Licence Apache'];
  return strayMarkers.some((marker) => cleaned.includes(marker));
}
// CC BY 4.0 notice inserted into HTML content (placed directly after a licence heading).
const CC_BY_NOTICE_HTML = `\n<p>Copyright \u00a9 2026 John Stroh.</p>\n<p>This work is licensed under the <a href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution 4.0 International Licence (CC BY 4.0)</a>.</p>\n<p>You are free to share, copy, redistribute, adapt, remix, transform, and build upon this material for any purpose, including commercially, provided you give appropriate attribution, provide a link to the licence, and indicate if changes were made.</p>\n<p><strong>Note:</strong> The Tractatus AI Safety Framework source code is separately licensed under the Apache License 2.0. This Creative Commons licence applies to the research paper text and figures only.</p>`;

// CC BY 4.0 notice inserted into Markdown content (placed directly after a licence heading).
const CC_BY_NOTICE_MD = `\n\nCopyright \u00a9 2026 John Stroh.\n\nThis work is licensed under the [Creative Commons Attribution 4.0 International Licence (CC BY 4.0)](https://creativecommons.org/licenses/by/4.0/).\n\nYou are free to share, copy, redistribute, adapt, remix, transform, and build upon this material for any purpose, including commercially, provided you give appropriate attribution, provide a link to the licence, and indicate if changes were made.\n\n**Note:** The Tractatus AI Safety Framework source code is separately licensed under the Apache License 2.0. This Creative Commons licence applies to the research paper text and figures only.\n`;

/** Delete every occurrence of each needle, in list order; reports whether anything was removed. */
function stripApacheNeedles(text, needles) {
  let result = text;
  let changed = false;
  for (const needle of needles) {
    if (result.includes(needle)) {
      result = result.split(needle).join('');
      changed = true;
    }
  }
  return { text: result, changed };
}

/** Needles of all languages (en, de, fr, mi order) for primary content, which may mix languages. */
function allApacheNeedles() {
  const { en, de, fr, mi } = APACHE_STRINGS_TO_FIND;
  return [...en, ...de, ...fr, ...mi];
}

/** Migrate a CC BY document's primary HTML: strip Apache text, normalise headings, ensure a CC BY notice. */
function migrateCcByHtml(source, needles) {
  let { text: html, changed } = stripApacheNeedles(source, needles);

  // Heading normalisation, applied in order. `[^>]*` tolerates attributes such
  // as id="license"; `[\s\S]*?` matches through inner HTML elements.
  const headingRules = [
    [/<h2[^>]*>(?:Document )?License<\/h2>/i, '<h2>Licence</h2>'],
    [/<h2[^>]*>Lizenz[\s\S]*?<\/h2>/i, '<h2>Lizenz</h2>'],
    [/<h2[^>]*>Licence[\s\S]*?<\/h2>/i, '<h2>Licence</h2>'],
  ];
  for (const [pattern, replacement] of headingRules) {
    if (pattern.test(html)) {
      html = html.replace(pattern, replacement);
      changed = true;
    }
  }

  // A notice left by a previous run must not be duplicated.
  const alreadyHasCcBy = html.includes('Creative Commons') || html.includes('CC BY 4.0');
  if (!alreadyHasCcBy) {
    const licIdx = html.indexOf('<h2>Licence</h2>');
    const headingIdx = licIdx >= 0 ? licIdx : html.indexOf('<h2>Lizenz</h2>');
    if (changed && headingIdx >= 0) {
      // Insert directly after the normalised heading.
      const afterHeading = html.indexOf('</h2>', headingIdx) + '</h2>'.length;
      html = html.substring(0, afterHeading) + CC_BY_NOTICE_HTML + html.substring(afterHeading);
    } else {
      // Either nothing was touched, or Apache text was removed from a document
      // that has no licence heading. In both cases append a full section so the
      // document is never left without any licence text.
      html = html.trimEnd() + '\n<h2>Licence</h2>' + CC_BY_NOTICE_HTML + '\n';
      changed = true;
    }
  }
  return { text: html, changed };
}

/** Migrate a CC BY document's primary Markdown: strip Apache text, normalise headings, ensure a CC BY notice. */
function migrateCcByMarkdown(source, needles) {
  let { text: md, changed } = stripApacheNeedles(source, needles);

  if (md.includes('## License') || md.includes('## Document License') || md.includes('## Lizenz')) {
    md = md.replace(/## (?:Document )?License/, '## Licence');
    md = md.replace(/## Lizenz(?:\s+Copyright)/, '## Licence\n\nCopyright');
    changed = true;
  }

  const alreadyHasCcBy = md.includes('Creative Commons') || md.includes('CC BY 4.0');
  if (!alreadyHasCcBy) {
    const headingLine = /## Licence\r?\n/;
    if (changed && headingLine.test(md)) {
      // Function replacement keeps the notice literal (no `$` pattern expansion).
      md = md.replace(headingLine, () => `## Licence${CC_BY_NOTICE_MD}`);
    } else if (changed && md.trimEnd().endsWith('## Licence')) {
      // Heading is the last thing in the text, with no line break after it.
      md = md.trimEnd() + CC_BY_NOTICE_MD;
    } else {
      // Either nothing was touched, or Apache text was removed but there is no
      // usable heading: append a full section rather than leaving no notice.
      md = md.trimEnd() + '\n\n## Licence' + CC_BY_NOTICE_MD;
      changed = true;
    }
  }
  return { text: md, changed };
}

/** Strip Apache text from one translation's HTML and, if anything was removed, normalise its licence heading. */
function migrateTranslationHtml(source, lang) {
  const langNeedles = APACHE_STRINGS_TO_FIND[lang] || APACHE_STRINGS_TO_FIND.en;
  // Language-specific needles first, then English (some translations mix).
  const stripped = stripApacheNeedles(source, [...langNeedles, ...APACHE_STRINGS_TO_FIND.en]);
  if (!stripped.changed) {
    return stripped;
  }
  const html = stripped.text
    .replace(/<h2>Lizenz[^<]*<\/h2>/, '<h2>Lizenz</h2>')
    .replace(/<h2>Licence[^<]*<\/h2>/, '<h2>Licence</h2>')
    .replace(/<h2>License<\/h2>/, '<h2>Licence</h2>')
    .replace(/<h2>R\u0101ngai[^<]*<\/h2>/, '<h2>R\u0101ngai</h2>');
  return { text: html, changed: true };
}

/**
 * Run the migration against the `documents` collection of DB_NAME.
 *
 * Every document gets its `licence` field set ('CC-BY-4.0' when its slug is in
 * the CC BY list, otherwise 'Apache-2.0'). CC BY documents additionally have
 * Apache licence text removed from content_html, content_markdown and each
 * translation's content_html, and a CC BY 4.0 notice added to the two primary
 * fields. With --dry-run nothing is written; the same report is printed.
 *
 * @returns {Promise<void>} Resolves after the summary is printed and the client is closed.
 */
async function main() {
  console.log(`\n=== Licence Migration: Apache 2.0 → CC BY 4.0 ===`);
  console.log(`Database: ${DB_NAME}`);
  console.log(`Mode: ${DRY_RUN ? 'DRY RUN' : 'LIVE'}\n`);

  const client = new MongoClient('mongodb://localhost:27017');
  try {
    await client.connect();
    const db = client.db(DB_NAME);
    const collection = db.collection('documents');
    const documents = await collection.find({}).toArray();
    console.log(`Found ${documents.length} documents in database\n`);

    // Loop-invariant: built once instead of per document.
    const primaryNeedles = allApacheNeedles();
    let updated = 0;
    let warnings = 0;

    for (const doc of documents) {
      const slug = doc.slug;
      const isCcBy = shouldBeCcBy(slug);
      const licence = isCcBy ? 'CC-BY-4.0' : 'Apache-2.0';
      const updates = { licence };
      const changes = [];

      if (isCcBy) {
        if (doc.content_html) {
          const result = migrateCcByHtml(doc.content_html, primaryNeedles);
          if (result.changed) {
            updates.content_html = result.text;
            changes.push('content_html');
          }
          // Checked after all replacements.
          if (hasStrayApache(result.text)) {
            changes.push('content_html:WARNING_STRAY_APACHE');
            warnings++;
          }
        }

        if (doc.content_markdown) {
          const result = migrateCcByMarkdown(doc.content_markdown, primaryNeedles);
          if (result.changed) {
            updates.content_markdown = result.text;
            changes.push('content_markdown');
          }
          if (hasStrayApache(result.text)) {
            changes.push('content_markdown:WARNING_STRAY_APACHE');
            warnings++;
          }
        }

        if (doc.translations) {
          for (const [lang, translation] of Object.entries(doc.translations)) {
            // A null/empty translation entry is skipped instead of aborting the
            // run part-way through, after earlier documents were already written.
            if (!translation || !translation.content_html) continue;

            const result = migrateTranslationHtml(translation.content_html, lang);
            if (result.changed) {
              updates[`translations.${lang}.content_html`] = result.text;
              changes.push(`translations.${lang}.content_html`);
            }
            if (hasStrayApache(result.text)) {
              changes.push(`translations.${lang}:WARNING_STRAY_APACHE`);
              warnings++;
            }
          }
        }
      }

      // Only log if there are actual changes or it's a CC BY doc
      if (changes.length > 0 || isCcBy) {
        const status = changes.length > 0 ? changes.join(', ') : (isCcBy ? 'already correct or no licence block' : '');
        console.log(`[${slug}] → ${licence} ${status ? '| ' + status : ''}`);
      }

      // `updates` always carries `licence`, so every document is written (or,
      // in a dry run, counted as one that would be written).
      if (!DRY_RUN) {
        await collection.updateOne({ _id: doc._id }, { $set: updates });
      }
      updated++;
    }

    console.log(`\n--- Summary ---`);
    console.log(`Total documents: ${documents.length}`);
    console.log(`Updated: ${updated}`);
    console.log(`Warnings (stray Apache text): ${warnings}`);
    if (warnings > 0) {
      console.log('\nWARNING: Some documents still contain Apache text after replacement.');
      console.log('These may need manual review — the text may be in an unusual format.');
    }
    if (DRY_RUN) {
      console.log('\nRe-run without --dry-run to apply changes.');
    }
  } finally {
    // Always release the connection, including when a document fails mid-run.
    await client.close();
  }
}
/**
 * Last-resort handler for a rejected main(): print the error and exit with
 * status 1.
 */
function reportFatalError(error) {
  console.error('Fatal error:', error);
  process.exit(1);
}

// Script entry point.
main().catch((error) => reportFatalError(error));