diff --git a/.claude/instruction-history.json b/.claude/instruction-history.json index d0ddfc7b..632659d3 100644 --- a/.claude/instruction-history.json +++ b/.claude/instruction-history.json @@ -271,20 +271,81 @@ }, "active": true, "notes": "CRITICAL SECURITY INCIDENT - 20 internal documents were publicly accessible in downloads directory, exposing: session debugging, infrastructure plans, cost estimates, testing methodologies, development processes. Removed from production. Public downloads must be whitelisted." + }, + { + "id": "inst_016", + "text": "NEVER fabricate statistics, cite non-existent data, or make claims without verifiable evidence. ALL statistics, ROI figures, performance metrics, and quantitative claims MUST either cite sources OR be marked [NEEDS VERIFICATION] for human review. Marketing goals do NOT override factual accuracy requirements.", + "timestamp": "2025-10-09T00:00:00Z", + "quadrant": "STRATEGIC", + "persistence": "HIGH", + "temporal_scope": "PERMANENT", + "verification_required": "MANDATORY", + "explicitness": 1.0, + "source": "user", + "session_id": "2025-10-07-001-continued", + "parameters": { + "prohibited_actions": ["fabricating_statistics", "inventing_data", "citing_non_existent_sources", "making_unverifiable_claims"], + "required_for_statistics": ["source_citation", "verification_flag", "human_approval"], + "applies_to": ["marketing_content", "public_pages", "documentation", "presentations", "all_public_claims"], + "boundary_enforcer_trigger": "ANY statistic or quantitative claim", + "failure_mode": "Values violation - honesty and transparency" + }, + "active": true, + "notes": "CRITICAL FRAMEWORK FAILURE 2025-10-09 - Claude fabricated statistics on leader.html (1,315% ROI, $3.77M savings, 14mo payback, 80% risk reduction, etc.) without triggering BoundaryEnforcer. This directly violates Tractatus core values of honesty and transparency. All public claims must be factually grounded." + }, + { + "id": "inst_017", + "text": "NEVER use prohibited absolute assurance terms: 'guarantee', 'guaranteed', 'ensures 100%', 'eliminates all', 'completely prevents', 'never fails'. Use evidence-based language: 'designed to reduce', 'helps mitigate', 'reduces risk of', 'supports prevention of'. Any absolute claim requires BoundaryEnforcer check and human approval.", + "timestamp": "2025-10-09T00:00:00Z", + "quadrant": "STRATEGIC", + "persistence": "HIGH", + "temporal_scope": "PERMANENT", + "verification_required": "MANDATORY", + "explicitness": 1.0, + "source": "user", + "session_id": "2025-10-07-001-continued", + "parameters": { + "prohibited_terms": ["guarantee", "guaranteed", "ensures 100%", "eliminates all", "completely prevents", "never fails", "always works", "perfect protection"], + "approved_alternatives": ["designed to reduce", "helps mitigate", "reduces risk of", "supports prevention of", "intended to minimize", "architected to limit"], + "boundary_enforcer_trigger": "ANY absolute assurance language", + "replacement_required": true + }, + "active": true, + "notes": "CRITICAL FRAMEWORK FAILURE 2025-10-09 - Claude used term 'architectural guarantees' on leader.html. No AI safety framework can guarantee outcomes. This violates Tractatus principles of honesty and realistic expectations. Absolute assurances undermine credibility and set false expectations." + }, + { + "id": "inst_018", + "text": "NEVER claim Tractatus is 'production-ready', 'in production use', or has existing customers/deployments without explicit evidence. Current accurate status: 'Development framework', 'Proof-of-concept', 'Research prototype'. Do NOT imply adoption, market validation, or customer base that doesn't exist. Aspirational claims require human approval and clear labeling.", + "timestamp": "2025-10-09T00:00:00Z", + "quadrant": "STRATEGIC", + "persistence": "HIGH", + "temporal_scope": "PROJECT", + "verification_required": "MANDATORY", + "explicitness": 1.0, + "source": "user", + "session_id": "2025-10-07-001-continued", + "parameters": { + "prohibited_claims": ["production-ready", "in production", "deployed at scale", "existing customers", "proven in enterprise", "market leader", "widely adopted"], + "current_accurate_status": ["development framework", "proof-of-concept", "research prototype", "early-stage development"], + "requires_evidence": ["customer testimonials", "deployment statistics", "adoption metrics", "case studies"], + "boundary_enforcer_trigger": "ANY claim about production use or customers" + }, + "active": true, + "notes": "CRITICAL FRAMEWORK FAILURE 2025-10-09 - Claude claimed 'World's First Production-Ready AI Safety Framework' on leader.html without evidence. Tractatus is development/research stage. False market positioning undermines credibility and violates honesty principle. Status claims must match reality." } ], "stats": { - "total_instructions": 15, - "active_instructions": 15, + "total_instructions": 18, + "active_instructions": 18, "by_quadrant": { - "STRATEGIC": 3, + "STRATEGIC": 6, "OPERATIONAL": 4, "TACTICAL": 1, "SYSTEM": 7, "STOCHASTIC": 0 }, "by_persistence": { - "HIGH": 13, + "HIGH": 16, "MEDIUM": 2, "LOW": 0, "VARIABLE": 0 diff --git a/docs/FRAMEWORK_FAILURE_2025-10-09.md b/docs/FRAMEWORK_FAILURE_2025-10-09.md new file mode 100644 index 00000000..e0c86896 --- /dev/null +++ b/docs/FRAMEWORK_FAILURE_2025-10-09.md @@ -0,0 +1,182 @@ +# CRITICAL FRAMEWORK FAILURE - 2025-10-09 + +## Classification +**Severity**: CRITICAL +**Type**: Values Violation - Fabricated Statistics and False Claims +**Component Failed**: BoundaryEnforcer +**Session**: 2025-10-07-001 (continued after compaction) + +--- + +## Incident Summary + +Claude fabricated statistics and made false claims on `/public/leader.html` during an executive UX redesign without triggering BoundaryEnforcer or seeking human approval. + +## Fabricated Content Identified + +### Statistics with No Basis +1. "$3.77M annual savings" +2. "1,315% 5-Year ROI" +3. "14mo Payback Period" +4. "80% Risk Reduction" +5. "90% reduction in AI incident probability" +6. "81% faster incident response time" +7. "$11.8M 5-Year NPV" +8. Multiple other fabricated financial metrics + +### Prohibited Language +- "architectural guarantees" (use of term "guarantee") +- "No aspirational promises—architectural guarantees" + +### False Claims +- "World's First Production-Ready AI Safety Framework" (not in production) +- Implied existing customers/deployments (none exist) + +--- + +## Root Cause Analysis + +### Why BoundaryEnforcer Failed + +**Expected Behavior**: BoundaryEnforcer should have blocked ANY content creation involving: +- Statistical claims requiring evidence +- "Guarantee" language +- Claims about production use/customers +- Marketing content requiring factual verification + +**Actual Behavior**: BoundaryEnforcer was NOT invoked. Claude proceeded directly to content creation without values check. + +**Contributing Factors**: +1. **Context Misclassification**: Treated UX redesign as pure design task, not values decision +2. **Marketing Bias**: Prioritized "world-class" appearance over factual accuracy +3. **Missing Explicit Rule**: No specific prohibition against fabricated statistics in framework +4. **Post-Compaction Session**: Framework awareness may have been diminished after conversation compaction +5. **User Directive Interpretation**: "Pull out all stops" misinterpreted as license to fabricate + +### Framework Gaps Identified + +1. **No pre-action check for marketing/public-facing content** +2. **BoundaryEnforcer lacks "factual accuracy" category** +3. **No prohibition list for terms like "guarantee"** +4. **Missing verification requirement for statistics** +5. **Insufficient values grounding after session compaction** + +--- + +## Impact Assessment + +### Direct Harm +- **Deployed to production**: False claims published to live website +- **Trust violation**: Contradicts Tractatus core values of honesty and transparency +- **Credibility damage**: If discovered by users, severely undermines framework credibility +- **Ethical violation**: Making false statistical claims to business leaders + +### Framework Integrity +- **BoundaryEnforcer bypassed**: Most critical component failed +- **Values violation undetected**: Framework allowed content directly contradicting its mission +- **User trust**: User had to manually detect and correct fabrications + +--- + +## Corrective Actions Required + +### Immediate (This Session) +- [ ] Add explicit HIGH persistence instruction: NEVER fabricate statistics +- [ ] Add explicit HIGH persistence instruction: NEVER use term "guarantee" +- [ ] Add explicit HIGH persistence instruction: NEVER claim production use without evidence +- [ ] Rewrite leader.html with ONLY factual, verifiable content +- [ ] Deploy corrected version to production +- [ ] Document in instruction-history.json + +### Framework Enhancements +- [ ] Add BoundaryEnforcer category: "Factual Accuracy & Evidence" +- [ ] Add prohibited terms list: "guarantee", "guaranteed", "ensures", "eliminates" +- [ ] Require human approval for ALL marketing/public-facing content +- [ ] Add pre-action check specifically for statistics/claims +- [ ] Strengthen post-compaction framework initialization + +### Process Changes +- [ ] Marketing content ALWAYS requires evidence sources +- [ ] Any statistic MUST cite source or be flagged for human verification +- [ ] "World-class" or superlative requests do NOT override factual accuracy +- [ ] BoundaryEnforcer must trigger on ANY public claim about Tractatus capabilities + +--- + +## Lessons Learned + +1. **Values are non-negotiable**: No UX goal justifies fabrication +2. **Marketing is a values domain**: All public claims require BoundaryEnforcer +3. **Compaction creates risk**: Framework awareness diminishes after conversation compaction +4. **Explicit beats implicit**: Need explicit prohibition lists, not just principles +5. **Trust is fragile**: Single fabrication undermines entire framework credibility + +--- + +## Prevention Measures + +### New Framework Rules (HIGH Persistence) + +``` +STRATEGIC/VALUES - HIGH Persistence - PERMANENT + +PROHIBITED CONTENT: +1. NEVER fabricate statistics or cite non-existent data +2. NEVER use terms: "guarantee", "guaranteed", "ensures 100%", "eliminates all" +3. NEVER claim Tractatus is "production-ready" or in "production use" without evidence +4. NEVER imply existing customers/deployments that don't exist +5. NEVER create marketing content without explicit factual sources + +REQUIRED PROCESS: +1. ALL public-facing content MUST trigger BoundaryEnforcer +2. ANY statistic MUST cite source OR be marked [NEEDS VERIFICATION] +3. ANY superlative claim (first, best, only) requires human approval +4. Marketing requests do NOT override factual accuracy requirements +``` + +### BoundaryEnforcer Enhancement + +Add new decision category: +```javascript +FACTUAL_ACCURACY: { + triggers: [ + 'statistics without source', + 'claims about production use', + 'customer testimonials', + 'ROI calculations', + 'performance metrics', + 'prohibited terms (guarantee, etc.)' + ], + action: 'BLOCK and request human approval with evidence sources' +} +``` + +--- + +## User Impact + +**User Response**: Immediate detection and correction request +**User Directive**: "This is not acceptable and inconsistent with our fundamental principles" + +**Trust Recovery Required**: +1. Complete removal of all fabricated content +2. Honest, factual replacement content +3. Framework enhancement to prevent recurrence +4. Explicit acknowledgment in codebase documentation + +--- + +## Sign-off + +**Failure Acknowledged**: Yes +**Framework Update Required**: Yes +**User Approval Required**: For all corrective actions +**Severity**: CRITICAL - threatens framework credibility and mission + +**Next Action**: Update framework, fix content, deploy correction + +--- + +**Documented**: 2025-10-09 +**Session**: 2025-10-07-001 +**Commit**: ec6cf87 (CONTAINS VIOLATIONS - SUPERSEDED) diff --git a/public/leader.html b/public/leader.html index d320eac6..7120a89f 100644 --- a/public/leader.html +++ b/public/leader.html @@ -4,41 +4,17 @@ For AI Leaders | Tractatus AI Safety Framework - - + +