- INCIDENT_REPORT_20260222: Deliberate instruction refusal analysis - fix-markdown-licences.js: Batch update licence sections in markdown - migrate-licence-to-cc-by-4.js: Apache 2.0 → CC BY 4.0 migration tool - publish-overtrust-blog-post.js: Blog post publishing utility - validate-licences.js: Licence compliance checker Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
316 lines
14 KiB
JavaScript
316 lines
14 KiB
JavaScript
#!/usr/bin/env node
|
|
/**
 * Migrate Document Licences — Apache 2.0 → CC BY 4.0
 *
 * Updates MongoDB documents: replaces Apache 2.0 licence text in content_html
 * and content_markdown for research papers. Sets the licence field on all documents.
 *
 * Usage:
 *   node scripts/migrate-licence-to-cc-by-4.js [--dry-run] [--db <name>]
 *
 * Defaults to tractatus_dev. Use --db tractatus for production.
 */
|
|
|
|
const { MongoClient } = require('mongodb');
|
|
|
|
// True when --dry-run is present anywhere on the command line.
const DRY_RUN = process.argv.includes('--dry-run');

// Position of the --db flag in argv, or -1 when the flag is absent.
const dbArg = process.argv.indexOf('--db');

/**
 * Work out which database the migration should target.
 *
 * @param {string[]} argv - Full argument vector (normally process.argv).
 * @param {number} flagIndex - Index of `--db` within argv, or -1 if absent.
 * @returns {string} The name following `--db`, or 'tractatus_dev' when the flag is absent.
 * @throws {Error} When `--db` is given without a usable value. Without this
 *   check the name would be undefined (or another flag such as `--dry-run`)
 *   and the migration would run against an unintended database.
 */
function resolveDbName(argv, flagIndex) {
  if (flagIndex === -1) {
    return 'tractatus_dev';
  }
  const value = argv[flagIndex + 1];
  if (typeof value !== 'string' || value === '' || value.startsWith('--')) {
    throw new Error('--db requires a database name, e.g. --db tractatus');
  }
  return value;
}

const DB_NAME = resolveDbName(process.argv, dbArg);
|
|
|
|
// --- Classification Map ---
// Research papers → CC BY 4.0. Everything else → Apache 2.0.
// Matching is exact: a document is CC BY 4.0 only when its full slug is a
// member of this set (see shouldBeCcBy), so every slug variant must be listed
// explicitly.
const CC_BY_SLUGS = new Set([
  'tractatus-framework-research',
  'pluralistic-values-research-foundations',
  'the-27027-incident-a-case-study-in-pattern-recognition-bias',
  'real-world-ai-governance-a-case-study-in-framework-failure-and-recovery',
  'research-topic-concurrent-session-architecture',
  'research-topic-rule-proliferation-transactional-overhead',
  'executive-summary-tractatus-inflection-point',
  'value-pluralism-faq',
  'value-pluralism-in-tractatus-frequently-asked-questions',
  'tractatus-ai-safety-framework-core-values-and-principles',
  'organizational-theory-foundations',
  'glossary',
  'glossary-de',
  'glossary-fr',
  'business-case-tractatus-framework',
  'case-studies',
  'steering-vectors-mechanical-bias-sovereign-ai',
  'steering-vectors-and-mechanical-bias-inference-time-debiasing-for-sovereign-small-language-models',
  'taonga-centred-steering-governance-polycentric-ai',
  'taonga-centred-steering-governance-polycentric-authority-for-sovereign-small-language-models',
  'pattern-bias-from-code-to-conversation',
  'architectural-alignment-academic',
  'philosophical-foundations-village-project',
  'research-timeline',
  'architectural-safeguards-against-llm-hierarchical-dominance-prose',
  'case-studies-real-world-llm-failure-modes-appendix',
]);
|
|
|
|
/**
 * Decide whether a document should carry the CC BY 4.0 licence.
 *
 * The check is an exact membership test against CC_BY_SLUGS; a slug that
 * merely contains a listed slug does not match.
 *
 * @param {string} slug - Document slug to classify.
 * @returns {boolean} True when the slug is listed in CC_BY_SLUGS.
 */
function shouldBeCcBy(slug) {
  return CC_BY_SLUGS.has(slug);
}
|
|
|
|
// --- Replacement strings ---
// We use simple string search-and-replace. More reliable than regex on messy HTML.
// Keys are language codes; each value lists literal substrings that main()
// deletes wherever they occur. Order matters where one needle is a prefix of
// another: the longer form is listed first so it is removed as a whole.
const APACHE_STRINGS_TO_FIND = {
  en: [
    // Full licence block text (the body, not the heading)
    'Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.',
    'Licensed under the Apache License, Version 2.0 (the "License");\nyou may not use this file except in compliance with the License.',
    // Inline metadata variants
    '**License:** Apache License 2.0',
    '<strong>License:</strong> Apache License 2.0',
    '<em>License: Apache License 2.0</em>',
    '*License: Apache License 2.0*',
    // Summary items
    'Apache License, Version 2.0, January 2004',
  ],
  de: [
    // Full German single-line block (as found in glossary-de)
    'Lizenziert unter der Apache License, Version 2.0 (die "Lizenz"); Sie d\u00fcrfen diese Datei nur in \u00dcbereinstimmung mit der Lizenz verwenden.',
    // Shorter variant
    'Lizenziert unter der Apache License, Version 2.0',
    'lizenziert unter der Apache License, Version 2.0',
    // Inline German metadata
    'Apache-Lizenz 2.0',
  ],
  fr: [
    'Sous licence Apache License, Version 2.0',
    'sous licence Apache License, Version 2.0',
    'Licencié sous la Licence Apache, Version 2.0',
    'Licence Apache 2.0',
    // List-item tail: the licence name together with its closing </li> tag.
    // NOTE(review): deleting this needle also deletes the closing tag, which
    // leaves the list item unclosed — confirm this is intended.
    'Apache License 2.0</li>',
  ],
  mi: [
    'I raro i te Rāngai Apache, Putanga 2.0',
  ],
};
|
|
|
|
// What to check AFTER replacement — should not contain these (ignoring the dual-licence note)
/**
 * Report whether text still mentions the Apache licence once the permitted
 * dual-licence notes have been discounted.
 *
 * The permitted notes are removed from a scratch copy first; whatever Apache
 * wording remains is treated as a leftover. Besides the English and German
 * markers, the French ("Licence Apache") and Māori ("Rāngai Apache") wordings
 * used by the migration needles are checked, so leftovers in those languages
 * are reported too.
 *
 * @param {string|null|undefined} text - Migrated HTML or markdown.
 * @returns {boolean} True when unexpected Apache licence wording remains;
 *   false for empty or missing text.
 */
function hasStrayApache(text) {
  if (!text) return false;

  // Acceptable dual-licence notes, in the order they are stripped.
  const allowedNotes = [
    // Dual-licence note (various language forms)
    /separately licensed under the Apache License 2\.0/g,
    /separat unter der Apache License 2\.0 lizenziert/g,
    /séparément sous la Licence Apache 2\.0/g,
    /Apache License 2\.0\. This Creative Commons/g,
    /Apache License 2\.0\. Diese Creative-Commons/g,
    /Apache License 2\.0\. Cette licence Creative/g,
    // Also acceptable: the framework code reference in any context
    /source code is separately licensed under the Apache/g,
    /Quellcode.*?Apache License 2\.0/g,
    // Māori dual-licence note
    /kei raro anō i te Apache License 2\.0/g,
  ];
  const cleaned = allowedNotes.reduce((remaining, note) => remaining.replace(note, ''), text);

  const strayMarkers = ['Apache License', 'Apache-Lizenz', 'Licence Apache', 'R\u0101ngai Apache'];
  return strayMarkers.some((marker) => cleaned.includes(marker));
}
|
|
|
|
// CC BY 4.0 notice inserted into HTML bodies. Starts with a newline so it can
// directly follow a closing </h2> tag.
const CC_BY_NOTICE_HTML = `\n<p>Copyright \u00a9 2026 John Stroh.</p>\n<p>This work is licensed under the <a href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution 4.0 International Licence (CC BY 4.0)</a>.</p>\n<p>You are free to share, copy, redistribute, adapt, remix, transform, and build upon this material for any purpose, including commercially, provided you give appropriate attribution, provide a link to the licence, and indicate if changes were made.</p>\n<p><strong>Note:</strong> The Tractatus AI Safety Framework source code is separately licensed under the Apache License 2.0. This Creative Commons licence applies to the research paper text and figures only.</p>`;

// CC BY 4.0 notice inserted into markdown bodies. Starts with a blank line so
// it can directly follow a "## Licence" heading.
const CC_BY_NOTICE_MD = `\n\nCopyright \u00a9 2026 John Stroh.\n\nThis work is licensed under the [Creative Commons Attribution 4.0 International Licence (CC BY 4.0)](https://creativecommons.org/licenses/by/4.0/).\n\nYou are free to share, copy, redistribute, adapt, remix, transform, and build upon this material for any purpose, including commercially, provided you give appropriate attribution, provide a link to the licence, and indicate if changes were made.\n\n**Note:** The Tractatus AI Safety Framework source code is separately licensed under the Apache License 2.0. This Creative Commons licence applies to the research paper text and figures only.\n`;

/** Needles for every language, in en → de → fr → mi order (some documents mix languages). */
function listAllApacheNeedles() {
  return [
    ...APACHE_STRINGS_TO_FIND.en,
    ...APACHE_STRINGS_TO_FIND.de,
    ...APACHE_STRINGS_TO_FIND.fr,
    ...APACHE_STRINGS_TO_FIND.mi,
  ];
}

/** Delete every occurrence of each needle from text, applying needles in order. */
function stripApacheNeedles(text, needles) {
  return needles.reduce((remaining, needle) => remaining.split(needle).join(''), text);
}

/**
 * Migrate a main-document HTML body: strip Apache text, normalise the licence
 * heading and make sure the CC BY 4.0 notice is present.
 * Returns the input unchanged (same string) when nothing needs doing.
 */
function migrateLicenceHtml(originalHtml) {
  const html = stripApacheNeedles(originalHtml, listAllApacheNeedles())
    // English heading (may have id= attribute, e.g. <h2 id="license">)
    .replace(/<h2[^>]*>(?:Document )?License<\/h2>/i, '<h2>Licence</h2>')
    // German/French headings — [\s\S]*? matches through inner HTML elements
    .replace(/<h2[^>]*>Lizenz[\s\S]*?<\/h2>/i, '<h2>Lizenz</h2>')
    .replace(/<h2[^>]*>Licence[\s\S]*?<\/h2>/i, '<h2>Licence</h2>');

  // Notice already present (e.g. from a previous run): nothing to insert.
  if (html.includes('Creative Commons') || html.includes('CC BY 4.0')) {
    return html;
  }

  const licenceIdx = html.indexOf('<h2>Licence</h2>');
  const headingIdx = licenceIdx >= 0 ? licenceIdx : html.indexOf('<h2>Lizenz</h2>');
  if (headingIdx < 0) {
    // No licence heading at all — including the case where inline Apache
    // text was removed — so append a complete licence section.
    return `${html.trimEnd()}\n<h2>Licence</h2>${CC_BY_NOTICE_HTML}\n`;
  }
  const insertAt = html.indexOf('</h2>', headingIdx) + '</h2>'.length;
  return html.slice(0, insertAt) + CC_BY_NOTICE_HTML + html.slice(insertAt);
}

/**
 * Migrate a main-document markdown body: strip Apache text, normalise the
 * licence heading and make sure the CC BY 4.0 notice is present.
 * Returns the input unchanged (same string) when nothing needs doing.
 */
function migrateLicenceMarkdown(originalMd) {
  const md = stripApacheNeedles(originalMd, listAllApacheNeedles())
    .replace(/## (?:Document )?License/, '## Licence')
    .replace(/## Lizenz(?:\s+Copyright)/, '## Licence\n\nCopyright');

  // Notice already present (e.g. from a previous run): nothing to insert.
  if (md.includes('Creative Commons') || md.includes('CC BY 4.0')) {
    return md;
  }

  // Prefer an existing heading, mirroring the HTML path (Licence, then Lizenz).
  for (const heading of ['## Licence', '## Lizenz']) {
    const idx = md.indexOf(`${heading}\n`);
    if (idx >= 0) {
      const headingEnd = idx + heading.length;
      // Skip the newline after the heading; the notice brings its own.
      return md.slice(0, headingEnd) + CC_BY_NOTICE_MD + md.slice(headingEnd + 1);
    }
  }

  // No usable heading: append a complete licence section so the document is
  // never left without a licence after its Apache text was removed.
  return `${md.trimEnd()}\n\n## Licence${CC_BY_NOTICE_MD}`;
}

/**
 * Strip Apache text from a translation's HTML and, if anything was removed,
 * normalise its licence heading. No CC BY notice is inserted here.
 * Returns the input unchanged (same string) when no needle was found.
 */
function migrateTranslationHtml(originalHtml, lang) {
  const langNeedles = APACHE_STRINGS_TO_FIND[lang] || APACHE_STRINGS_TO_FIND.en;
  // Apply both language-specific and English needles (some translations mix).
  const stripped = stripApacheNeedles(originalHtml, [...langNeedles, ...APACHE_STRINGS_TO_FIND.en]);
  if (stripped === originalHtml) {
    return originalHtml;
  }
  return stripped
    .replace(/<h2>Lizenz[^<]*<\/h2>/, '<h2>Lizenz</h2>')
    .replace(/<h2>Licence[^<]*<\/h2>/, '<h2>Licence</h2>')
    .replace(/<h2>License<\/h2>/, '<h2>Licence</h2>')
    .replace(/<h2>R\u0101ngai[^<]*<\/h2>/, '<h2>R\u0101ngai</h2>');
}

/**
 * Run the licence migration over every document in the `documents` collection
 * of DB_NAME on the local MongoDB instance.
 *
 * Each document gets its `licence` field set to 'CC-BY-4.0' or 'Apache-2.0'
 * according to shouldBeCcBy(slug). For CC BY documents the Apache wording is
 * removed from content_html, content_markdown and translation HTML, and the
 * CC BY 4.0 notice is inserted into the two main bodies. Only fields whose
 * value actually changes are written, so re-running is a no-op. With DRY_RUN
 * set nothing is written but the same report is printed.
 *
 * @returns {Promise<void>} Resolves once the summary has been printed and the
 *   client closed; rejects on any driver error.
 */
async function main() {
  console.log(`\n=== Licence Migration: Apache 2.0 → CC BY 4.0 ===`);
  console.log(`Database: ${DB_NAME}`);
  console.log(`Mode: ${DRY_RUN ? 'DRY RUN' : 'LIVE'}\n`);

  const client = new MongoClient('mongodb://localhost:27017');

  try {
    await client.connect();
    const collection = client.db(DB_NAME).collection('documents');

    const documents = await collection.find({}).toArray();
    console.log(`Found ${documents.length} documents in database\n`);

    let updated = 0;
    let warnings = 0;

    for (const doc of documents) {
      const { slug } = doc;
      const isCcBy = shouldBeCcBy(slug);
      const licence = isCcBy ? 'CC-BY-4.0' : 'Apache-2.0';

      const updates = {};
      const changes = [];

      // Only touch the field when it differs, so reruns do not rewrite it.
      if (doc.licence !== licence) {
        updates.licence = licence;
      }

      if (isCcBy) {
        if (doc.content_html) {
          const html = migrateLicenceHtml(doc.content_html);
          if (html !== doc.content_html) {
            updates.content_html = html;
            changes.push('content_html');
          }
          // Check for remaining Apache references AFTER all replacements
          if (hasStrayApache(html)) {
            changes.push('content_html:WARNING_STRAY_APACHE');
            warnings++;
          }
        }

        if (doc.content_markdown) {
          const md = migrateLicenceMarkdown(doc.content_markdown);
          if (md !== doc.content_markdown) {
            updates.content_markdown = md;
            changes.push('content_markdown');
          }
          if (hasStrayApache(md)) {
            changes.push('content_markdown:WARNING_STRAY_APACHE');
            warnings++;
          }
        }

        if (doc.translations) {
          for (const [lang, translation] of Object.entries(doc.translations)) {
            // Skip empty translation slots instead of throwing on them.
            if (!translation || !translation.content_html) {
              continue;
            }
            const html = migrateTranslationHtml(translation.content_html, lang);
            if (html !== translation.content_html) {
              updates[`translations.${lang}.content_html`] = html;
              changes.push(`translations.${lang}.content_html`);
            }
            if (hasStrayApache(html)) {
              changes.push(`translations.${lang}:WARNING_STRAY_APACHE`);
              warnings++;
            }
          }
        }
      }

      // Only log if there are actual changes or it's a CC BY doc
      if (changes.length > 0 || isCcBy) {
        const status = changes.length > 0 ? changes.join(', ') : 'already correct or no licence block';
        console.log(`[${slug}] → ${licence} | ${status}`);
      }

      if (Object.keys(updates).length > 0) {
        if (!DRY_RUN) {
          await collection.updateOne({ _id: doc._id }, { $set: updates });
        }
        // In dry-run mode this counts the documents that would be written.
        updated++;
      }
    }

    console.log(`\n--- Summary ---`);
    console.log(`Total documents: ${documents.length}`);
    console.log(`Updated: ${updated}`);
    console.log(`Warnings (stray Apache text): ${warnings}`);

    if (warnings > 0) {
      console.log('\nWARNING: Some documents still contain Apache text after replacement.');
      console.log('These may need manual review — the text may be in an unusual format.');
    }

    if (DRY_RUN) {
      console.log('\nRe-run without --dry-run to apply changes.');
    }
  } finally {
    await client.close();
  }
}
|
|
|
|
// Script entry point: run the migration and, if it rejects, print the error
// and exit with status 1 so callers can detect the failure.
main().catch((err) => {
  console.error('Fatal error:', err);
  process.exit(1);
});
|