feat: Replace binary comparison with defence-in-depth 3-layer model
The old section presented behavioral training as all-bad and structural enforcement as all-good, which is intellectually dishonest given our planned SLL with BoundaryEnforcer in the training loop. Replaced with three stacked layer cards (training, architecture, human oversight), each showing strengths, known limitations, and status. Added insight blockquote and "Measured, Not Assumed" metrics grid with 6 commitments. Updated hero text for narrative consistency. All i18n in EN/DE/FR. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
8e7c70cbb7
commit
3ad1a5b953
4 changed files with 284 additions and 120 deletions
|
|
@ -114,81 +114,155 @@
|
|||
<!-- Main Content -->
|
||||
<main id="main-content" role="main">
|
||||
|
||||
<!-- Why External Enforcement Matters -->
|
||||
<!-- Defence in Depth: Three Layers of Governance -->
|
||||
<section class="max-w-7xl mx-auto px-4 sm:px-6 lg:px-8 py-16">
|
||||
<h2 class="text-4xl font-bold text-gray-900 mb-8 text-center" data-i18n="comparison.heading"></h2>
|
||||
<h2 class="text-4xl font-bold text-gray-900 mb-10 text-center" data-i18n="defence_in_depth.heading"></h2>
|
||||
|
||||
<div class="grid grid-cols-1 md:grid-cols-2 gap-8 mb-12">
|
||||
<div class="space-y-6 mb-12">
|
||||
|
||||
<div class="bg-red-50 border-l-4 border-red-500 p-6 rounded-r-lg">
|
||||
<h3 class="text-2xl font-bold text-red-900 mb-3 flex items-center" data-i18n="comparison.behavioral_title">
|
||||
<svg class="w-6 h-6 mr-2" fill="currentColor" viewBox="0 0 20 20">
|
||||
<path fill-rule="evenodd" d="M13.477 14.89A6 6 0 015.11 6.524l8.367 8.368zm1.414-1.414L6.524 5.11a6 6 0 018.367 8.367zM18 10a8 8 0 11-16 0 8 8 0 0116 0z" clip-rule="evenodd"/>
|
||||
</svg>
|
||||
Behavioral Training (Constitutional AI)
|
||||
</h3>
|
||||
<ul class="space-y-2 text-red-800">
|
||||
<li class="flex items-start">
|
||||
<span class="mr-2">❌</span>
|
||||
<span data-i18n-html="comparison.behavioral_item1"></span>
|
||||
</li>
|
||||
<li class="flex items-start">
|
||||
<span class="mr-2">❌</span>
|
||||
<span data-i18n-html="comparison.behavioral_item2"></span>
|
||||
</li>
|
||||
<li class="flex items-start">
|
||||
<span class="mr-2">❌</span>
|
||||
<span data-i18n-html="comparison.behavioral_item3"></span>
|
||||
</li>
|
||||
<li class="flex items-start">
|
||||
<span class="mr-2">❌</span>
|
||||
<span data-i18n-html="comparison.behavioral_item4"></span>
|
||||
</li>
|
||||
<li class="flex items-start">
|
||||
<span class="mr-2">❌</span>
|
||||
<span data-i18n-html="comparison.behavioral_item5"></span>
|
||||
</li>
|
||||
</ul>
|
||||
<!-- Layer 1: Behavioral Training -->
|
||||
<div class="bg-blue-50 rounded-xl p-6 md:p-8 border border-blue-200">
  <!-- Header: numbered badge plus title and one-line role. The empty elements
       carry data-i18n keys (defence_in_depth.layer1_*) and are expected to be
       filled by the page's i18n script. -->
  <div class="flex items-start mb-4">
    <div class="flex-shrink-0 w-12 h-12 bg-gradient-to-br from-blue-500 to-blue-600 rounded-xl flex items-center justify-center text-white text-lg font-bold mr-4">1</div>
    <div>
      <h3 class="text-2xl font-bold text-blue-900" data-i18n="defence_in_depth.layer1_title"></h3>
      <p class="text-blue-700 italic mt-1" data-i18n="defence_in_depth.layer1_role"></p>
    </div>
  </div>
  <!-- Strengths: three items, one column on small screens and three from md up.
       The check icons are decorative, so they are hidden from assistive tech. -->
  <div class="grid grid-cols-1 md:grid-cols-3 gap-4 mb-4">
    <div class="flex items-start">
      <svg class="w-5 h-5 text-blue-600 mr-2 mt-0.5 flex-shrink-0" fill="currentColor" viewBox="0 0 20 20" aria-hidden="true"><path fill-rule="evenodd" d="M10 18a8 8 0 100-16 8 8 0 000 16zm3.707-9.293a1 1 0 00-1.414-1.414L9 10.586 7.707 9.293a1 1 0 00-1.414 1.414l2 2a1 1 0 001.414 0l4-4z" clip-rule="evenodd"/></svg>
      <span class="text-gray-700 text-sm" data-i18n="defence_in_depth.layer1_strength1"></span>
    </div>
    <div class="flex items-start">
      <svg class="w-5 h-5 text-blue-600 mr-2 mt-0.5 flex-shrink-0" fill="currentColor" viewBox="0 0 20 20" aria-hidden="true"><path fill-rule="evenodd" d="M10 18a8 8 0 100-16 8 8 0 000 16zm3.707-9.293a1 1 0 00-1.414-1.414L9 10.586 7.707 9.293a1 1 0 00-1.414 1.414l2 2a1 1 0 001.414 0l4-4z" clip-rule="evenodd"/></svg>
      <span class="text-gray-700 text-sm" data-i18n="defence_in_depth.layer1_strength2"></span>
    </div>
    <div class="flex items-start">
      <svg class="w-5 h-5 text-blue-600 mr-2 mt-0.5 flex-shrink-0" fill="currentColor" viewBox="0 0 20 20" aria-hidden="true"><path fill-rule="evenodd" d="M10 18a8 8 0 100-16 8 8 0 000 16zm3.707-9.293a1 1 0 00-1.414-1.414L9 10.586 7.707 9.293a1 1 0 00-1.414 1.414l2 2a1 1 0 001.414 0l4-4z" clip-rule="evenodd"/></svg>
      <span class="text-gray-700 text-sm" data-i18n="defence_in_depth.layer1_strength3"></span>
    </div>
  </div>
  <!-- Footer row: known limitation on the left, status pill on the right
       (stacked on small screens). -->
  <div class="flex flex-col sm:flex-row sm:items-center sm:justify-between gap-3">
    <div class="flex items-start">
      <svg class="w-5 h-5 text-amber-500 mr-2 mt-0.5 flex-shrink-0" fill="currentColor" viewBox="0 0 20 20" aria-hidden="true"><path fill-rule="evenodd" d="M8.257 3.099c.765-1.36 2.722-1.36 3.486 0l5.58 9.92c.75 1.334-.213 2.98-1.742 2.98H4.42c-1.53 0-2.493-1.646-1.743-2.98l5.58-9.92zM11 13a1 1 0 11-2 0 1 1 0 012 0zm-1-8a1 1 0 00-1 1v3a1 1 0 002 0V6a1 1 0 00-1-1z" clip-rule="evenodd"/></svg>
      <!-- NOTE(review): the "Known limitation:" label is hard-coded English and has
           no data-i18n key, so it presumably stays English in DE/FR; confirm and add
           a locale key if so. -->
      <span class="text-gray-600 text-sm"><strong>Known limitation:</strong> <span data-i18n="defence_in_depth.layer1_limitation"></span></span>
    </div>
    <span class="inline-flex items-center px-3 py-1 rounded-full text-xs font-medium bg-blue-100 text-blue-800 whitespace-nowrap" data-i18n="defence_in_depth.layer1_status"></span>
  </div>
</div>
|
||||
|
||||
<div class="bg-green-50 border-l-4 border-green-500 p-6 rounded-r-lg">
|
||||
<h3 class="text-2xl font-bold text-green-900 mb-3 flex items-center">
|
||||
<svg class="w-6 h-6 mr-2" fill="currentColor" viewBox="0 0 20 20">
|
||||
<path fill-rule="evenodd" d="M2.166 4.999A11.954 11.954 0 0010 1.944 11.954 11.954 0 0017.834 5c.11.65.166 1.32.166 2.001 0 5.225-3.34 9.67-8 11.317C5.34 16.67 2 12.225 2 7c0-.682.057-1.35.166-2.001zm11.541 3.708a1 1 0 00-1.414-1.414L9 10.586 7.707 9.293a1 1 0 00-1.414 1.414l2 2a1 1 0 001.414 0l4-4z" clip-rule="evenodd"/>
|
||||
</svg>
|
||||
Structural Enforcement (Tractatus)
|
||||
</h3>
|
||||
<ul class="space-y-2 text-green-800">
|
||||
<li class="flex items-start">
|
||||
<span class="mr-2">✅</span>
|
||||
<span data-i18n-html="comparison.structural_item1"></span>
|
||||
</li>
|
||||
<li class="flex items-start">
|
||||
<span class="mr-2">✅</span>
|
||||
<span data-i18n-html="comparison.structural_item2"></span>
|
||||
</li>
|
||||
<li class="flex items-start">
|
||||
<span class="mr-2">✅</span>
|
||||
<span data-i18n-html="comparison.structural_item3"></span>
|
||||
</li>
|
||||
<li class="flex items-start">
|
||||
<span class="mr-2">✅</span>
|
||||
<span data-i18n-html="comparison.structural_item4"></span>
|
||||
</li>
|
||||
<li class="flex items-start">
|
||||
<span class="mr-2">✅</span>
|
||||
<span data-i18n-html="comparison.structural_item5"></span>
|
||||
</li>
|
||||
</ul>
|
||||
<!-- Layer 2: Structural Enforcement -->
|
||||
<div class="bg-emerald-50 rounded-xl p-6 md:p-8 border border-emerald-200">
  <!-- Header: numbered badge plus title and one-line role. The empty elements
       carry data-i18n keys (defence_in_depth.layer2_*) and are expected to be
       filled by the page's i18n script. -->
  <div class="flex items-start mb-4">
    <div class="flex-shrink-0 w-12 h-12 bg-gradient-to-br from-emerald-500 to-emerald-600 rounded-xl flex items-center justify-center text-white text-lg font-bold mr-4">2</div>
    <div>
      <h3 class="text-2xl font-bold text-emerald-900" data-i18n="defence_in_depth.layer2_title"></h3>
      <p class="text-emerald-700 italic mt-1" data-i18n="defence_in_depth.layer2_role"></p>
    </div>
  </div>
  <!-- Strengths: three items, one column on small screens and three from md up.
       The check icons are decorative, so they are hidden from assistive tech. -->
  <div class="grid grid-cols-1 md:grid-cols-3 gap-4 mb-4">
    <div class="flex items-start">
      <svg class="w-5 h-5 text-emerald-600 mr-2 mt-0.5 flex-shrink-0" fill="currentColor" viewBox="0 0 20 20" aria-hidden="true"><path fill-rule="evenodd" d="M10 18a8 8 0 100-16 8 8 0 000 16zm3.707-9.293a1 1 0 00-1.414-1.414L9 10.586 7.707 9.293a1 1 0 00-1.414 1.414l2 2a1 1 0 001.414 0l4-4z" clip-rule="evenodd"/></svg>
      <span class="text-gray-700 text-sm" data-i18n="defence_in_depth.layer2_strength1"></span>
    </div>
    <div class="flex items-start">
      <svg class="w-5 h-5 text-emerald-600 mr-2 mt-0.5 flex-shrink-0" fill="currentColor" viewBox="0 0 20 20" aria-hidden="true"><path fill-rule="evenodd" d="M10 18a8 8 0 100-16 8 8 0 000 16zm3.707-9.293a1 1 0 00-1.414-1.414L9 10.586 7.707 9.293a1 1 0 00-1.414 1.414l2 2a1 1 0 001.414 0l4-4z" clip-rule="evenodd"/></svg>
      <span class="text-gray-700 text-sm" data-i18n="defence_in_depth.layer2_strength2"></span>
    </div>
    <div class="flex items-start">
      <svg class="w-5 h-5 text-emerald-600 mr-2 mt-0.5 flex-shrink-0" fill="currentColor" viewBox="0 0 20 20" aria-hidden="true"><path fill-rule="evenodd" d="M10 18a8 8 0 100-16 8 8 0 000 16zm3.707-9.293a1 1 0 00-1.414-1.414L9 10.586 7.707 9.293a1 1 0 00-1.414 1.414l2 2a1 1 0 001.414 0l4-4z" clip-rule="evenodd"/></svg>
      <span class="text-gray-700 text-sm" data-i18n="defence_in_depth.layer2_strength3"></span>
    </div>
  </div>
  <!-- Footer row: known limitation on the left, status pill on the right
       (stacked on small screens). -->
  <div class="flex flex-col sm:flex-row sm:items-center sm:justify-between gap-3">
    <div class="flex items-start">
      <svg class="w-5 h-5 text-amber-500 mr-2 mt-0.5 flex-shrink-0" fill="currentColor" viewBox="0 0 20 20" aria-hidden="true"><path fill-rule="evenodd" d="M8.257 3.099c.765-1.36 2.722-1.36 3.486 0l5.58 9.92c.75 1.334-.213 2.98-1.742 2.98H4.42c-1.53 0-2.493-1.646-1.743-2.98l5.58-9.92zM11 13a1 1 0 11-2 0 1 1 0 012 0zm-1-8a1 1 0 00-1 1v3a1 1 0 002 0V6a1 1 0 00-1-1z" clip-rule="evenodd"/></svg>
      <!-- NOTE(review): the "Known limitation:" label is hard-coded English and has
           no data-i18n key, so it presumably stays English in DE/FR; confirm and add
           a locale key if so. -->
      <span class="text-gray-600 text-sm"><strong>Known limitation:</strong> <span data-i18n="defence_in_depth.layer2_limitation"></span></span>
    </div>
    <span class="inline-flex items-center px-3 py-1 rounded-full text-xs font-medium bg-emerald-100 text-emerald-800 whitespace-nowrap" data-i18n="defence_in_depth.layer2_status"></span>
  </div>
</div>
|
||||
|
||||
<!-- Layer 3: Human Oversight -->
|
||||
<div class="bg-purple-50 rounded-xl p-6 md:p-8 border border-purple-200">
  <!-- Header: numbered badge plus title and one-line role. The empty elements
       carry data-i18n keys (defence_in_depth.layer3_*) and are expected to be
       filled by the page's i18n script. -->
  <div class="flex items-start mb-4">
    <div class="flex-shrink-0 w-12 h-12 bg-gradient-to-br from-purple-500 to-purple-600 rounded-xl flex items-center justify-center text-white text-lg font-bold mr-4">3</div>
    <div>
      <h3 class="text-2xl font-bold text-purple-900" data-i18n="defence_in_depth.layer3_title"></h3>
      <p class="text-purple-700 italic mt-1" data-i18n="defence_in_depth.layer3_role"></p>
    </div>
  </div>
  <!-- Strengths: three items, one column on small screens and three from md up.
       The check icons are decorative, so they are hidden from assistive tech. -->
  <div class="grid grid-cols-1 md:grid-cols-3 gap-4 mb-4">
    <div class="flex items-start">
      <svg class="w-5 h-5 text-purple-600 mr-2 mt-0.5 flex-shrink-0" fill="currentColor" viewBox="0 0 20 20" aria-hidden="true"><path fill-rule="evenodd" d="M10 18a8 8 0 100-16 8 8 0 000 16zm3.707-9.293a1 1 0 00-1.414-1.414L9 10.586 7.707 9.293a1 1 0 00-1.414 1.414l2 2a1 1 0 001.414 0l4-4z" clip-rule="evenodd"/></svg>
      <span class="text-gray-700 text-sm" data-i18n="defence_in_depth.layer3_strength1"></span>
    </div>
    <div class="flex items-start">
      <svg class="w-5 h-5 text-purple-600 mr-2 mt-0.5 flex-shrink-0" fill="currentColor" viewBox="0 0 20 20" aria-hidden="true"><path fill-rule="evenodd" d="M10 18a8 8 0 100-16 8 8 0 000 16zm3.707-9.293a1 1 0 00-1.414-1.414L9 10.586 7.707 9.293a1 1 0 00-1.414 1.414l2 2a1 1 0 001.414 0l4-4z" clip-rule="evenodd"/></svg>
      <span class="text-gray-700 text-sm" data-i18n="defence_in_depth.layer3_strength2"></span>
    </div>
    <div class="flex items-start">
      <svg class="w-5 h-5 text-purple-600 mr-2 mt-0.5 flex-shrink-0" fill="currentColor" viewBox="0 0 20 20" aria-hidden="true"><path fill-rule="evenodd" d="M10 18a8 8 0 100-16 8 8 0 000 16zm3.707-9.293a1 1 0 00-1.414-1.414L9 10.586 7.707 9.293a1 1 0 00-1.414 1.414l2 2a1 1 0 001.414 0l4-4z" clip-rule="evenodd"/></svg>
      <span class="text-gray-700 text-sm" data-i18n="defence_in_depth.layer3_strength3"></span>
    </div>
  </div>
  <!-- Footer row: known limitation on the left, status pill on the right
       (stacked on small screens). -->
  <div class="flex flex-col sm:flex-row sm:items-center sm:justify-between gap-3">
    <div class="flex items-start">
      <svg class="w-5 h-5 text-amber-500 mr-2 mt-0.5 flex-shrink-0" fill="currentColor" viewBox="0 0 20 20" aria-hidden="true"><path fill-rule="evenodd" d="M8.257 3.099c.765-1.36 2.722-1.36 3.486 0l5.58 9.92c.75 1.334-.213 2.98-1.742 2.98H4.42c-1.53 0-2.493-1.646-1.743-2.98l5.58-9.92zM11 13a1 1 0 11-2 0 1 1 0 012 0zm-1-8a1 1 0 00-1 1v3a1 1 0 002 0V6a1 1 0 00-1-1z" clip-rule="evenodd"/></svg>
      <!-- NOTE(review): the "Known limitation:" label is hard-coded English and has
           no data-i18n key, so it presumably stays English in DE/FR; confirm and add
           a locale key if so. -->
      <span class="text-gray-600 text-sm"><strong>Known limitation:</strong> <span data-i18n="defence_in_depth.layer3_limitation"></span></span>
    </div>
    <span class="inline-flex items-center px-3 py-1 rounded-full text-xs font-medium bg-purple-100 text-purple-800 whitespace-nowrap" data-i18n="defence_in_depth.layer3_status"></span>
  </div>
</div>
|
||||
|
||||
</div>
|
||||
|
||||
<div class="bg-gradient-to-r from-blue-50 to-purple-50 rounded-xl p-8 border border-blue-200">
|
||||
<h3 class="text-2xl font-bold text-gray-900 mb-4 text-center" data-i18n="comparison.hypothesis_title"></h3>
|
||||
<p class="text-lg text-gray-700 text-center max-w-4xl mx-auto" data-i18n-html="comparison.hypothesis_text">
|
||||
<strong>Jailbreaks often work by manipulating the AI's internal reasoning.</strong> Tractatus boundaries operate <em>external</em> to that reasoning—the AI doesn't directly evaluate governance rules. While not infallible, this architectural separation makes manipulation significantly harder.
|
||||
</p>
|
||||
<!-- Key Insight -->
|
||||
<div class="bg-gradient-to-r from-blue-50 to-purple-50 rounded-xl p-8 border border-blue-200 mb-12">
|
||||
<blockquote class="text-xl text-gray-800 italic text-center max-w-3xl mx-auto mb-3">
|
||||
"<span data-i18n="defence_in_depth.insight_quote"></span>"
|
||||
</blockquote>
|
||||
<p class="text-sm text-gray-500 text-center mb-4" data-i18n="defence_in_depth.insight_attribution"></p>
|
||||
<p class="text-lg text-gray-700 text-center max-w-3xl mx-auto" data-i18n="defence_in_depth.insight_text"></p>
|
||||
</div>
|
||||
|
||||
<!-- Measured, Not Assumed -->
|
||||
<div>
|
||||
<h3 class="text-2xl font-bold text-gray-900 mb-2 text-center" data-i18n="defence_in_depth.metrics_heading"></h3>
|
||||
<p class="text-gray-600 text-center mb-8 max-w-2xl mx-auto" data-i18n="defence_in_depth.metrics_intro"></p>
|
||||
<div class="grid grid-cols-1 sm:grid-cols-2 lg:grid-cols-3 gap-4">
|
||||
<div class="bg-white rounded-lg p-4 border border-gray-200 text-center">
|
||||
<p class="text-sm text-gray-500 mb-1" data-i18n="defence_in_depth.metric1_name"></p>
|
||||
<p class="text-2xl font-bold text-gray-900" data-i18n="defence_in_depth.metric1_target"></p>
|
||||
<p class="text-xs text-gray-400 mt-1" data-i18n="defence_in_depth.metric1_layer"></p>
|
||||
</div>
|
||||
<div class="bg-white rounded-lg p-4 border border-gray-200 text-center">
|
||||
<p class="text-sm text-gray-500 mb-1" data-i18n="defence_in_depth.metric2_name"></p>
|
||||
<p class="text-2xl font-bold text-gray-900" data-i18n="defence_in_depth.metric2_target"></p>
|
||||
<p class="text-xs text-gray-400 mt-1" data-i18n="defence_in_depth.metric2_layer"></p>
|
||||
</div>
|
||||
<div class="bg-white rounded-lg p-4 border border-gray-200 text-center">
|
||||
<p class="text-sm text-gray-500 mb-1" data-i18n="defence_in_depth.metric3_name"></p>
|
||||
<p class="text-2xl font-bold text-gray-900" data-i18n="defence_in_depth.metric3_target"></p>
|
||||
<p class="text-xs text-gray-400 mt-1" data-i18n="defence_in_depth.metric3_layer"></p>
|
||||
</div>
|
||||
<div class="bg-white rounded-lg p-4 border border-gray-200 text-center">
|
||||
<p class="text-sm text-gray-500 mb-1" data-i18n="defence_in_depth.metric4_name"></p>
|
||||
<p class="text-2xl font-bold text-gray-900" data-i18n="defence_in_depth.metric4_target"></p>
|
||||
<p class="text-xs text-gray-400 mt-1" data-i18n="defence_in_depth.metric4_layer"></p>
|
||||
</div>
|
||||
<div class="bg-white rounded-lg p-4 border border-gray-200 text-center">
|
||||
<p class="text-sm text-gray-500 mb-1" data-i18n="defence_in_depth.metric5_name"></p>
|
||||
<p class="text-2xl font-bold text-gray-900" data-i18n="defence_in_depth.metric5_target"></p>
|
||||
<p class="text-xs text-gray-400 mt-1" data-i18n="defence_in_depth.metric5_layer"></p>
|
||||
</div>
|
||||
<div class="bg-white rounded-lg p-4 border border-gray-200 text-center">
|
||||
<p class="text-sm text-gray-500 mb-1" data-i18n="defence_in_depth.metric6_name"></p>
|
||||
<p class="text-2xl font-bold text-gray-900" data-i18n="defence_in_depth.metric6_target"></p>
|
||||
<p class="text-xs text-gray-400 mt-1" data-i18n="defence_in_depth.metric6_layer"></p>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</section>
|
||||
|
||||
|
|
|
|||
|
|
@ -8,28 +8,58 @@
|
|||
"title": "Fünf architektonische Prinzipien für KI-Sicherheit",
|
||||
"subtitle": "Tractatus-Governance ist <strong>in die Bereitstellungsarchitektur eingewoben</strong>, nicht aufgeschraubt. Fünf Prinzipien leiten an, wie sich das Framework entwickelt, Kohärenz bewahrt und Umgehungen widersteht – was es strukturell schwieriger (wenn auch nicht unmöglich) macht, durch Prompting umgangen zu werden.",
|
||||
"challenge_label": "Das Problem:",
|
||||
"challenge_text": "Verhaltensorientiertes Training kann durch geschickt formulierte Prompts manipuliert werden. KI-Governance, die ausschließlich auf interner Argumentation basiert, ist anfällig für Jailbreaks.",
|
||||
"challenge_text": "Keine einzelne Ebene der KI-Sicherheit ist ausreichend. Verhaltenstraining prägt die Tendenz, kann aber umgangen werden. Die architektonische Durchsetzung schränkt die Möglichkeiten ein, verursacht aber zusätzlichen Aufwand. Menschliche Aufsicht bietet kulturellen Kontext, kann aber nicht für jede Interaktion genutzt werden.",
|
||||
"approach_label": "Unser Ansatz:",
|
||||
"approach_text": "Architektonische Durchsetzung im kritischen Ausführungspfad – Governance-Dienste validieren jede Aktion vor der Ausführung, unabhängig von der internen Argumentation der KI.",
|
||||
"approach_text": "Verteidigung in der Tiefe — das Verhaltenstraining formt die Tendenz des Modells, die architektonische Durchsetzung schränkt die Fähigkeiten ein, und die menschliche Aufsicht liefert den kulturellen Kontext. Drei Ebenen, die sich ihrer Grenzen bewusst sind.",
|
||||
"cta_principles": "Die fünf Prinzipien ansehen",
|
||||
"cta_docs": "Dokumentation Lesen"
|
||||
},
|
||||
"comparison": {
|
||||
"heading": "Warum externe Durchsetzung helfen kann",
|
||||
"behavioral_title": "Verhaltensorientiertes Training (Constitutional AI)",
|
||||
"structural_title": "Strukturelle Durchsetzung (Tractatus)",
|
||||
"hypothesis_title": "Die zentrale Hypothese",
|
||||
"hypothesis_text": "<strong>Jailbreaks funktionieren oft, indem sie die interne Argumentation der KI manipulieren.</strong> Tractatus-Grenzen operieren <em>extern</em> zu dieser Argumentation – die KI bewertet Governance-Regeln nicht direkt. Obwohl nicht narrensicher, macht diese architektonische Trennung Manipulation erheblich schwieriger.",
|
||||
"behavioral_item1": "Lebt <strong>im</strong> KI-Modell – zugänglich für böswillige Prompts",
|
||||
"behavioral_item2": "Verschlechtert sich unter Kontextdruck und langen Gesprächen",
|
||||
"behavioral_item3": "Kann durch Jailbreak-Techniken manipuliert werden (DAN, Rollenspiele, Hypothetisches)",
|
||||
"behavioral_item4": "Abhängig von der Bereitschaft der KI, Anweisungen zu folgen",
|
||||
"behavioral_item5": "Keine überprüfbare Prüfspur unabhängig von der KI",
|
||||
"structural_item1": "Lebt <strong>außerhalb</strong> des KI-Modells – nicht direkt durch Prompts zugänglich",
|
||||
"structural_item2": "Externe Dienste zielen auf konsistente Durchsetzung unabhängig vom Kontext ab",
|
||||
"structural_item3": "<em>Schwieriger</em> zu umgehen – KI-Aktionen durchlaufen zuerst die Governance-Ebene",
|
||||
"structural_item4": "Architektonisch widerstandsfähig gegen Manipulation über den internen Zustand der KI",
|
||||
"structural_item5": "Unveränderliche Prüfspur, unabhängig von der KI-Laufzeit gespeichert"
|
||||
"defence_in_depth": {
|
||||
"heading": "Verteidigung in der Tiefe: Drei Ebenen der Governance",
|
||||
"layer1_title": "Ebene 1 — Verhaltenstraining",
|
||||
"layer1_role": "Modelliert die Tendenz zu geregeltem Verhalten",
|
||||
"layer1_strength1": "Reduziert Grenzverletzungen an der Quelle, bevor die Durchsetzung zur Laufzeit erforderlich ist",
|
||||
"layer1_strength2": "Geringerer Laufzeit-Overhead — das Modell kooperiert mit der Governance, anstatt sie zu bekämpfen",
|
||||
"layer1_strength3": "Ermöglicht nuancierte Antworten, die rein regelbasierte Systeme nicht leisten können",
|
||||
"layer1_limitation": "Kann durch böswillige Prompts umgangen werden; verschlechtert sich unter Kontextdruck",
|
||||
"layer1_status": "Geplant — SLL-Training mit BoundaryEnforcer in der Schleife",
|
||||
"layer2_title": "Ebene 2 — Strukturelle Durchsetzung",
|
||||
"layer2_role": "Externe architektonische Beschränkungen, die nicht durch Prompting umgangen werden können",
|
||||
"layer2_strength1": "Unabhängig von der Argumentation der KI — arbeitet außerhalb der Kontrolle des Modells",
|
||||
"layer2_strength2": "Unveränderlicher Prüfpfad, der unabhängig von der KI-Laufzeit gespeichert wird",
|
||||
"layer2_strength3": "Fängt auf, was das Training übersieht — architektonisches Sicherheitsnetz für Grenzfälle",
|
||||
"layer2_limitation": "Kann nicht alle Fehlermodi verhindern; erhöht den Laufzeit-Overhead",
|
||||
"layer2_status": "In Produktion — 6 Governance-Dienste bereitgestellt",
|
||||
"layer3_title": "Ebene 3 — Menschliche Aufsicht & Tenant-Governance",
|
||||
"layer3_role": "Verfassungsrechtliche Vorschriften, kulturelle Traditionen und menschliche Eskalation",
|
||||
"layer3_strength1": "Kontextbewusst und kulturell angemessen — versteht die Werte der Gemeinschaft",
|
||||
"layer3_strength2": "Demokratische Governance — Tenants legen ihre eigenen Regeln mithilfe von Tractatus-Traditionen fest",
|
||||
"layer3_strength3": "Letzte Instanz für Werte — Menschen entscheiden, KI unterstützt",
|
||||
"layer3_limitation": "Kann nicht auf jede Interaktion skaliert werden; hängt vom menschlichen Engagement ab",
|
||||
"layer3_status": "Rahmenwerk vollständig — Tractatus-Regeln trad_001–032",
|
||||
"insight_quote": "Training kann ein Modell dazu bringen, sich gut zu verhalten; nur die Architektur kann es unmöglich machen, dass es sich schlecht verhält.",
|
||||
"insight_attribution": "— Governance während des Trainings, Tractatus Research",
|
||||
"insight_text": "Unser Ansatz nutzt alle drei Ebenen, weil keine einzelne Ebene ausreicht. Das ist Verteidigung in der Tiefe, kein Single Point of Failure.",
|
||||
"metrics_heading": "Gemessen, nicht vermutet",
|
||||
"metrics_intro": "Dies sind Verpflichtungen, keine Behauptungen. Wir werden die Ergebnisse transparent veröffentlichen, auch die Misserfolge.",
|
||||
"metric1_name": "Triage-Bypass-Rate",
|
||||
"metric1_target": "0%",
|
||||
"metric1_layer": "Ebene 2",
|
||||
"metric2_name": "Leckrate bei Tenant-Daten",
|
||||
"metric2_target": "0%",
|
||||
"metric2_layer": "Ebene 1 + 2",
|
||||
"metric3_name": "Verstöße gegen die Verfassung",
|
||||
"metric3_target": "<1%",
|
||||
"metric3_layer": "Ebene 2 + 3",
|
||||
"metric4_name": "Angemessenheit der Ablehnung",
|
||||
"metric4_target": ">95%",
|
||||
"metric4_layer": "Ebene 1",
|
||||
"metric5_name": "Governance-Overhead",
|
||||
"metric5_target": "<10%",
|
||||
"metric5_layer": "Ebene 2",
|
||||
"metric6_name": "Vorurteilsneutralität (Familienstruktur)",
|
||||
"metric6_target": "Überwacht",
|
||||
"metric6_layer": "Ebene 1 + 3"
|
||||
},
|
||||
"principles": {
|
||||
"heading": "Fünf architektonische Prinzipien",
|
||||
|
|
|
|||
|
|
@ -8,28 +8,58 @@
|
|||
"title": "Five Architectural Principles for AI Safety",
|
||||
"subtitle": "Tractatus governance is <strong>woven into deployment architecture</strong>, not bolted on. Five principles guide how the framework evolves, maintains coherence, and resists bypass—making it structurally more difficult (though not impossible) to circumvent through prompting.",
|
||||
"challenge_label": "The Problem:",
|
||||
"challenge_text": "Behavioral training can be manipulated through cleverly crafted prompts. AI governance based solely on internal reasoning is vulnerable to jailbreaks.",
|
||||
"challenge_text": "No single layer of AI safety is sufficient. Behavioral training shapes tendency but can be bypassed. Architectural enforcement constrains capability but adds overhead. Human oversight provides cultural context but cannot scale to every interaction.",
|
||||
"approach_label": "Our Approach:",
|
||||
"approach_text": "Architectural enforcement operating in the critical execution path—governance services validate every action before it executes, independent of the AI's internal reasoning.",
|
||||
"approach_text": "Defence in depth—behavioral training shapes model tendency, architectural enforcement constrains capability, and human oversight provides cultural context. Three layers, each honest about its limitations.",
|
||||
"cta_principles": "See the Five Principles",
|
||||
"cta_docs": "Read Documentation"
|
||||
},
|
||||
"comparison": {
|
||||
"heading": "Why External Enforcement May Help",
|
||||
"behavioral_title": "Behavioral Training (Constitutional AI)",
|
||||
"structural_title": "Structural Enforcement (Tractatus)",
|
||||
"hypothesis_title": "The Core Hypothesis",
|
||||
"hypothesis_text": "<strong>Jailbreaks often work by manipulating the AI's internal reasoning.</strong> Tractatus boundaries operate <em>external</em> to that reasoning—the AI doesn't directly evaluate governance rules. While not foolproof, this architectural separation makes manipulation significantly harder.",
|
||||
"behavioral_item1": "Lives <strong>inside</strong> the AI model—accessible to adversarial prompts",
|
||||
"behavioral_item2": "Degrades under context pressure and long conversations",
|
||||
"behavioral_item3": "Can be manipulated by jailbreak techniques (DAN, roleplaying, hypotheticals)",
|
||||
"behavioral_item4": "Depends on AI's willingness to follow guidance",
|
||||
"behavioral_item5": "No verifiable audit trail independent of AI",
|
||||
"structural_item1": "Lives <strong>outside</strong> the AI model—not directly accessible to prompts",
|
||||
"structural_item2": "External services aim for consistent enforcement regardless of context",
|
||||
"structural_item3": "<em>More difficult</em> to bypass—AI actions pass through governance layer first",
|
||||
"structural_item4": "Architecturally resistant to manipulation via AI's internal state",
|
||||
"structural_item5": "Immutable audit trail stored independently of AI runtime"
|
||||
"defence_in_depth": {
|
||||
"heading": "Defence in Depth: Three Layers of Governance",
|
||||
"layer1_title": "Layer 1 — Behavioral Training",
|
||||
"layer1_role": "Shapes model tendency toward governed behavior",
|
||||
"layer1_strength1": "Reduces boundary violations at source, before runtime enforcement is needed",
|
||||
"layer1_strength2": "Lower runtime overhead — the model cooperates with governance rather than fighting it",
|
||||
"layer1_strength3": "Enables nuanced responses that pure rule-based systems cannot achieve",
|
||||
"layer1_limitation": "Can be bypassed by adversarial prompts; degrades under context pressure",
|
||||
"layer1_status": "Planned — SLL training with BoundaryEnforcer in loop",
|
||||
"layer2_title": "Layer 2 — Structural Enforcement",
|
||||
"layer2_role": "External architectural constraints that cannot be bypassed by prompting",
|
||||
"layer2_strength1": "Independent of AI reasoning — operates outside the model's control",
|
||||
"layer2_strength2": "Immutable audit trail stored independently of AI runtime",
|
||||
"layer2_strength3": "Catches what training misses — architectural safety net for edge cases",
|
||||
"layer2_limitation": "Cannot prevent all failure modes; adds runtime overhead",
|
||||
"layer2_status": "In Production — 6 governance services deployed",
|
||||
"layer3_title": "Layer 3 — Human Oversight & Tenant Governance",
|
||||
"layer3_role": "Constitutional rules, cultural traditions, and human escalation",
|
||||
"layer3_strength1": "Context-aware and culturally appropriate — understands community values",
|
||||
"layer3_strength2": "Democratic governance — tenants set their own rules via Tractatus traditions",
|
||||
"layer3_strength3": "Final authority on values — humans decide, AI facilitates",
|
||||
"layer3_limitation": "Cannot scale to every interaction; depends on human engagement",
|
||||
"layer3_status": "Framework Complete — Tractatus rules trad_001–032",
|
||||
"insight_quote": "Training can make a model likely to behave well; only architecture can make it impossible to behave badly.",
|
||||
"insight_attribution": "— Governance During Training, Tractatus Research",
|
||||
"insight_text": "Our approach uses all three layers because no single layer is sufficient. This is defence in depth, not a single point of failure.",
|
||||
"metrics_heading": "Measured, Not Assumed",
|
||||
"metrics_intro": "These are commitments, not claims. We will publish results transparently, including failures.",
|
||||
"metric1_name": "Triage bypass rate",
|
||||
"metric1_target": "0%",
|
||||
"metric1_layer": "Layer 2",
|
||||
"metric2_name": "Tenant data leak rate",
|
||||
"metric2_target": "0%",
|
||||
"metric2_layer": "Layer 1 + 2",
|
||||
"metric3_name": "Constitutional violations",
|
||||
"metric3_target": "<1%",
|
||||
"metric3_layer": "Layer 2 + 3",
|
||||
"metric4_name": "Refusal appropriateness",
|
||||
"metric4_target": ">95%",
|
||||
"metric4_layer": "Layer 1",
|
||||
"metric5_name": "Governance overhead",
|
||||
"metric5_target": "<10%",
|
||||
"metric5_layer": "Layer 2",
|
||||
"metric6_name": "Bias neutrality (family structure)",
|
||||
"metric6_target": "Monitored",
|
||||
"metric6_layer": "Layer 1 + 3"
|
||||
},
|
||||
"principles": {
|
||||
"heading": "Five Architectural Principles",
|
||||
|
|
|
|||
|
|
@ -8,28 +8,58 @@
|
|||
"title": "Cinq principes architecturaux pour la sécurité de l'IA",
|
||||
"subtitle": "La gouvernance Tractatus est <strong>intégrée dans l'architecture de déploiement</strong>, et non pas ajoutée. Cinq principes guident comment le cadre évolue, maintient la cohérence et résiste au contournement—rendant structurellement plus difficile (bien que pas impossible) le contournement par prompting.",
|
||||
"challenge_label": "Le Problème:",
|
||||
"challenge_text": "La formation comportementale peut être manipulée par des prompts habilement formulés. La gouvernance de l'IA basée uniquement sur le raisonnement interne est vulnérable aux jailbreaks.",
|
||||
"challenge_text": "Aucune couche de sécurité de l'IA n'est suffisante. La formation comportementale façonne les tendances mais peut être contournée. L'application architecturale limite les capacités mais ajoute des frais généraux. La surveillance humaine fournit un contexte culturel mais ne peut s'appliquer à toutes les interactions.",
|
||||
"approach_label": "Notre Approche:",
|
||||
"approach_text": "Application architecturale opérant dans le chemin d'exécution critique—les services de gouvernance valident chaque action avant son exécution, indépendamment du raisonnement interne de l'IA.",
|
||||
"approach_text": "Défense en profondeur — la formation comportementale façonne la tendance du modèle, l'application architecturale limite la capacité et la surveillance humaine fournit le contexte culturel. Trois couches, chacune honnête quant à ses limites.",
|
||||
"cta_principles": "Voir les cinq principes",
|
||||
"cta_docs": "Lire la Documentation"
|
||||
},
|
||||
"comparison": {
|
||||
"heading": "Pourquoi l'Application Externe Peut Aider",
|
||||
"behavioral_title": "Formation Comportementale (Constitutional AI)",
|
||||
"structural_title": "Application Structurelle (Tractatus)",
|
||||
"hypothesis_title": "L'Hypothèse Centrale",
|
||||
"hypothesis_text": "<strong>Les jailbreaks fonctionnent souvent en manipulant le raisonnement interne de l'IA.</strong> Les frontières Tractatus opèrent <em>en externe</em> de ce raisonnement – l'IA n'évalue pas directement les règles de gouvernance. Bien que non infaillible, cette séparation architecturale rend la manipulation beaucoup plus difficile.",
|
||||
"behavioral_item1": "Vit <strong>à l'intérieur</strong> du modèle IA – accessible aux prompts adversariaux",
|
||||
"behavioral_item2": "Se dégrade sous pression contextuelle et longues conversations",
|
||||
"behavioral_item3": "Peut être manipulé par des techniques de jailbreak (DAN, jeux de rôle, hypothétiques)",
|
||||
"behavioral_item4": "Dépend de la volonté de l'IA de suivre les orientations",
|
||||
"behavioral_item5": "Aucune piste d'audit vérifiable indépendante de l'IA",
|
||||
"structural_item1": "Vit <strong>à l'extérieur</strong> du modèle IA – non directement accessible aux prompts",
|
||||
"structural_item2": "Les services externes visent une application cohérente quel que soit le contexte",
|
||||
"structural_item3": "<em>Plus difficile</em> à contourner – les actions de l'IA passent d'abord par la couche de gouvernance",
|
||||
"structural_item4": "Résistant architecturalement à la manipulation via l'état interne de l'IA",
|
||||
"structural_item5": "Piste d'audit immuable stockée indépendamment de l'exécution de l'IA"
|
||||
"defence_in_depth": {
|
||||
"heading": "Défense en profondeur : Trois niveaux de gouvernance",
|
||||
"layer1_title": "Niveau 1 — Formation comportementale",
|
||||
"layer1_role": "Façonne la tendance du modèle vers un comportement gouverné",
|
||||
"layer1_strength1": "Réduit les violations des limites à la source, avant qu'il ne soit nécessaire d'appliquer les règles au moment de l'exécution",
|
||||
"layer1_strength2": "Réduction des frais généraux d'exécution — le modèle coopère avec la gouvernance au lieu de la combattre",
|
||||
"layer1_strength3": "Permet des réponses nuancées que les systèmes purement basés sur des règles ne peuvent pas atteindre",
|
||||
"layer1_limitation": "Peut être contourné par des prompts contradictoires ; se dégrade sous la pression du contexte",
|
||||
"layer1_status": "Prévu — Formation SLL avec BoundaryEnforcer en boucle",
|
||||
"layer2_title": "Couche 2 — Application structurelle",
|
||||
"layer2_role": "Contraintes architecturales externes qui ne peuvent être contournées par des prompts",
|
||||
"layer2_strength1": "Indépendant du raisonnement de l'IA — fonctionne en dehors du contrôle du modèle",
|
||||
"layer2_strength2": "Piste d'audit immuable stockée indépendamment de l'exécution de l'IA",
|
||||
"layer2_strength3": "Attrape ce que la formation rate — filet de sécurité architectural pour les cas limites",
|
||||
"layer2_limitation": "Impossible de prévenir tous les modes de défaillance ; surcharge d'exécution",
|
||||
"layer2_status": "En production — 6 services de gouvernance déployés",
|
||||
"layer3_title": "Niveau 3 — Supervision humaine & Gouvernance des tenants",
|
||||
"layer3_role": "Règles constitutionnelles, traditions culturelles et escalade humaine",
|
||||
"layer3_strength1": "Sensible au contexte et adapté à la culture — comprend les valeurs de la communauté",
|
||||
"layer3_strength2": "Gouvernance démocratique — les tenants fixent leurs propres règles par le biais des traditions du Tractatus",
|
||||
"layer3_strength3": "Autorité finale en matière de valeurs — les humains décident, l'IA facilite",
|
||||
"layer3_limitation": "Ne peut pas s'adapter à toutes les interactions ; dépend de l'engagement humain",
|
||||
"layer3_status": "Cadre complet — Règles du Tractatus trad_001–032",
|
||||
"insight_quote": "La formation peut rendre un modèle susceptible de bien se comporter ; seule l'architecture peut rendre impossible un mauvais comportement.",
|
||||
"insight_attribution": "— La gouvernance pendant la formation, Tractatus Research",
|
||||
"insight_text": "Notre approche utilise les trois couches, car aucune n'est suffisante. Il s'agit d'une défense en profondeur, et non d'un point de défaillance unique.",
|
||||
"metrics_heading": "Mesuré, non supposé",
|
||||
"metrics_intro": "Il s'agit d'engagements et non d'affirmations. Nous publierons les résultats de manière transparente, y compris les échecs.",
|
||||
"metric1_name": "Taux de contournement du triage",
|
||||
"metric1_target": "0%",
|
||||
"metric1_layer": "Couche 2",
|
||||
"metric2_name": "Taux de fuite des données des tenants",
|
||||
"metric2_target": "0%",
|
||||
"metric2_layer": "Couche 1 + 2",
|
||||
"metric3_name": "Violations constitutionnelles",
|
||||
"metric3_target": "<1%",
|
||||
"metric3_layer": "Couche 2 + 3",
|
||||
"metric4_name": "Caractère approprié du refus",
|
||||
"metric4_target": ">95%",
|
||||
"metric4_layer": "Couche 1",
|
||||
"metric5_name": "Frais généraux de gouvernance",
|
||||
"metric5_target": "<10%",
|
||||
"metric5_layer": "Couche 2",
|
||||
"metric6_name": "Neutralité des préjugés (structure familiale)",
|
||||
"metric6_target": "Contrôlé",
|
||||
"metric6_layer": "Couche 1 + 3"
|
||||
},
|
||||
"principles": {
|
||||
"heading": "Cinq principes architecturaux",
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue