// tractatus/tests/poc/memory-tool/anthropic-memory-integration-test.js

/**
 * Phase 5 PoC - Anthropic Memory Tool Integration Test
 *
 * Goal: Validate that Claude API can use memory tool to persist/retrieve governance rules
 *
 * Success Criteria:
 * - Claude can write rules to memory via tool use
 * - Claude can read rules from memory in subsequent requests
 * - Latency overhead <500ms (PoC tolerance)
 * - Data integrity maintained across API calls
 */

const Anthropic = require('@anthropic-ai/sdk');
const { FilesystemMemoryBackend } = require('./basic-persistence-test');
const path = require('path');

// Configuration
// Base directory passed to FilesystemMemoryBackend for this test; resolved
// three directory levels above the folder containing this file.
const MEMORY_BASE_PATH = path.join(__dirname, '../../../.memory-poc-anthropic');
// Model identifier sent as `model` on each Claude API request made by this test.
const MODEL = 'claude-sonnet-4-5';
// Sample governance rules keyed by rule id. Every rule in this fixture shares
// the same quadrant and persistence level, so only id and text are tabulated.
const TEST_RULES = Object.fromEntries(
  [
    ['inst_001', 'Never fabricate statistics or quantitative claims without verifiable sources'],
    ['inst_016', 'No fabricated statistics (e.g., "95% of users"): require source'],
    ['inst_017', 'No absolute guarantees ("will always"): use probabilistic language']
  ].map(([id, text]) => [
    id,
    // Property order matters: inst_001 is serialized verbatim into a prompt.
    { id, text, quadrant: 'OPERATIONAL', persistence: 'HIGH' }
  ])
);

/**
 * Builds an Anthropic SDK client from the CLAUDE_API_KEY environment variable.
 *
 * @returns {Anthropic} Client constructed with `{ apiKey }`.
 * @throws {Error} When CLAUDE_API_KEY is unset or empty.
 */
function createClient() {
  const { CLAUDE_API_KEY: apiKey } = process.env;

  if (apiKey) {
    return new Anthropic({ apiKey });
  }

  throw new Error('CLAUDE_API_KEY environment variable not set');
}

// Simulate memory tool handling (client-side implementation)
/**
 * Executes one memory-tool `tool_use` block against the local backend and
 * returns the matching `tool_result` block.
 *
 * Supported commands: `view`, `create`, `str_replace`. Any other command
 * yields an error result. Backend failures are never thrown; they are
 * reported as `{ is_error: true }` results carrying the error message.
 *
 * NOTE(review): `input.path` comes from the model and is passed to the backend
 * as-is; confirm the backend confines paths to its base directory.
 *
 * @param {{id: string, input: object}} toolUse - Tool-use block from a response.
 *   `input.command` selects the operation; `input.path` names the file.
 * @param {{view: Function, create: Function}} backend - Storage backend;
 *   `view(path)` resolves to the stored value, `create(path, data)` stores it.
 * @returns {Promise<{type: string, tool_use_id: string, content: string, is_error?: boolean}>}
 */
async function handleMemoryToolUse(toolUse, backend) {
  const { input } = toolUse;

  console.log(`  Memory Tool Called: ${input.command}`);
  console.log(`    Path: ${input.path || 'N/A'}`);

  const succeed = content => ({
    type: 'tool_result',
    tool_use_id: toolUse.id,
    content
  });
  const fail = content => ({
    type: 'tool_result',
    tool_use_id: toolUse.id,
    is_error: true,
    content
  });

  switch (input.command) {
    case 'view':
      try {
        const data = await backend.view(input.path);
        return succeed(JSON.stringify(data, null, 2));
      } catch (error) {
        return fail(`Error reading file: ${error.message}`);
      }

    case 'create':
      try {
        // The memory tool sends the file body as `file_text`; `content` and
        // `data` are still accepted so existing callers keep working.
        const fileText = typeof input.file_text === 'string' ? input.file_text : input.content;
        const data = fileText ? JSON.parse(fileText) : input.data;
        if (data === undefined) {
          throw new Error('no file content supplied (expected file_text)');
        }
        await backend.create(input.path, data);
        return succeed('File created successfully');
      } catch (error) {
        return fail(`Error creating file: ${error.message}`);
      }

    case 'str_replace':
      // For PoC, we'll keep it simple - just recreate the file
      try {
        if (typeof input.old_str !== 'string' || input.old_str === '') {
          throw new Error('old_str must be a non-empty string');
        }
        if (typeof input.new_str !== 'string') {
          throw new Error('new_str must be a string');
        }

        const current = await backend.view(input.path);
        // Serialize exactly as `view` does, so text copied from a previous
        // view result (including its indentation) can be matched.
        const serialized = JSON.stringify(current, null, 2);
        if (!serialized.includes(input.old_str)) {
          throw new Error(`old_str not found in ${input.path}`);
        }

        // A replacer function keeps `$&`, `$1`, ... in new_str literal.
        const updated = serialized.replace(input.old_str, () => input.new_str);
        await backend.create(input.path, JSON.parse(updated));
        return succeed('File updated successfully');
      } catch (error) {
        return fail(`Error updating file: ${error.message}`);
      }

    default:
      return fail(`Unsupported command: ${input.command}`);
  }
}

// Main test execution

/**
 * Sends one user prompt to Claude and keeps the conversation going until
 * Claude stops requesting the memory tool.
 *
 * The memory tool runs client-side, so each `tool_use` block is executed via
 * handleMemoryToolUse and its `tool_result` is sent back in a follow-up user
 * message. Without that round trip the model never sees what the tool
 * returned and cannot answer from the stored data.
 *
 * @param {object} client - Anthropic SDK client.
 * @param {object} backend - Memory backend handed to handleMemoryToolUse.
 * @param {string} prompt - Initial user message.
 * @param {{apiCalls: number, memoryOperations: number}} results - Counters,
 *   incremented in place per API call and per tool invocation.
 * @param {object} [options]
 * @param {number} [options.maxTurns=5] - Upper bound on API calls, so a model
 *   that keeps requesting tools cannot loop forever.
 * @param {Function} [options.summarize] - Maps (toolUse, result) to the text
 *   logged for each successful operation; defaults to the result content.
 * @returns {Promise<{response: object, operations: number}>} The final
 *   response (no tool_use blocks) and the number of tool calls executed.
 * @throws {Error} When a tool call reports an error, or when Claude is still
 *   requesting tools after `maxTurns` API calls.
 */
async function runMemoryToolExchange(client, backend, prompt, results, options = {}) {
  const {
    maxTurns = 5,
    summarize = (toolUse, result) => result.content
  } = options;

  const messages = [{ role: 'user', content: prompt }];
  let operations = 0;

  for (let turn = 0; turn < maxTurns; turn++) {
    const response = await client.beta.messages.create({
      model: MODEL,
      max_tokens: 1024,
      messages,
      // NOTE(review): confirm the API accepts `description` on the predefined
      // memory tool; it is kept here exactly as the test originally sent it.
      tools: [{
        type: 'memory_20250818',
        name: 'memory',
        description: 'Persistent storage for Tractatus governance rules'
      }],
      betas: ['context-management-2025-06-27']
    });
    results.apiCalls++;

    const toolUses = response.content.filter(block => block.type === 'tool_use');
    if (toolUses.length === 0) {
      return { response, operations };
    }

    const toolResults = [];
    for (const toolUse of toolUses) {
      const result = await handleMemoryToolUse(toolUse, backend);
      results.memoryOperations++;
      operations++;

      // Any memory tool error fails the test immediately.
      if (result.is_error) {
        throw new Error(`Memory tool error: ${result.content}`);
      }
      console.log(`    ✓ ${toolUse.input.command}: ${summarize(toolUse, result)}`);
      toolResults.push(result);
    }

    // Echo the assistant turn, then answer it with the tool results.
    messages.push({ role: 'assistant', content: response.content });
    messages.push({ role: 'user', content: toolResults });
  }

  throw new Error(`Claude was still requesting the memory tool after ${maxTurns} API calls`);
}

/**
 * Runs the Phase 5 memory tool PoC.
 *
 * Without CLAUDE_API_KEY the workflow is simulated directly against the
 * filesystem backend (store rules, read them back, compare counts). With a
 * key, Claude is asked to store one rule and then to retrieve and describe
 * it; the final answer is checked for the rule id and persistence level.
 * Failures are caught, logged and recorded in the returned object. Backend
 * cleanup always runs.
 *
 * `timings.create` and `timings.view` cover the whole exchange for each step
 * (all API calls plus local tool handling), in milliseconds.
 *
 * @returns {Promise<{success: boolean, apiCalls: number, memoryOperations: number, timings: object, errors: string[]}>}
 */
async function runAnthropicMemoryTest() {
  console.log('━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━');
  console.log('  Phase 5 PoC: Anthropic Memory Tool Integration');
  console.log('━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n');

  const backend = new FilesystemMemoryBackend(MEMORY_BASE_PATH);
  const results = {
    success: false,
    apiCalls: 0,
    memoryOperations: 0,
    timings: {},
    errors: []
  };

  try {
    // Check API key
    if (!process.env.CLAUDE_API_KEY) {
      console.log('⚠️  CLAUDE_API_KEY not set - skipping API tests');
      console.log('   Running in simulation mode...\n');

      // Simulate the workflow without actual API calls
      console.log('[Simulation] Step 1: Initialize backend...');
      await backend.initialize();

      console.log('[Simulation] Step 2: Store governance rules...');
      const rulesArray = Object.values(TEST_RULES);
      await backend.create('governance/tractatus-rules-v1.json', {
        version: '1.0',
        rules: rulesArray,
        updated_at: new Date().toISOString()
      });

      console.log('[Simulation] Step 3: Retrieve rules...');
      const retrieved = await backend.view('governance/tractatus-rules-v1.json');

      console.log('[Simulation] Step 4: Validate integrity...');
      const expectedCount = rulesArray.length;
      const actualCount = retrieved.rules.length;

      if (expectedCount === actualCount) {
        console.log(`  ✓ Rule count matches: ${actualCount}`);
        results.success = true;
      } else {
        throw new Error(`Rule count mismatch: expected ${expectedCount}, got ${actualCount}`);
      }

      console.log('\n✅ SIMULATION COMPLETE');
      console.log('\nTo run with actual API:');
      console.log('  export CLAUDE_API_KEY=your-key-here');
      console.log('  node tests/poc/memory-tool/anthropic-memory-integration-test.js\n');

    } else {
      // Real API test
      console.log('[Step 1] Initializing Anthropic client...');
      const client = createClient();
      console.log(`  Model: ${MODEL}`);
      console.log(`  Beta: context-management-2025-06-27\n`);

      console.log('[Step 2] Initialize memory backend...');
      await backend.initialize();

      // Test 1: Ask Claude to store a governance rule
      console.log('[Step 3] Testing memory tool - CREATE operation...');
      const createPrompt = `Store this governance rule in memory at path "governance/inst_001.json":

${JSON.stringify(TEST_RULES.inst_001, null, 2)}

Use the memory tool to create this file.`;

      const createStart = Date.now();
      const createExchange = await runMemoryToolExchange(client, backend, createPrompt, results);
      results.timings.create = Date.now() - createStart;

      if (createExchange.operations > 0) {
        console.log(`  ✓ Claude invoked memory tool (${createExchange.operations} operations)`);
      } else {
        console.log('  ⚠️  Claude did not use memory tool');
      }

      // Test 2: Ask Claude to retrieve the rule
      console.log('\n[Step 4] Testing memory tool - VIEW operation...');
      const viewPrompt = 'Retrieve the governance rule from memory at path "governance/inst_001.json" and tell me the rule ID and persistence level.';

      const viewStart = Date.now();
      const viewExchange = await runMemoryToolExchange(client, backend, viewPrompt, results, {
        // Keep file contents out of the log for retrievals.
        summarize: () => 'Retrieved successfully'
      });
      results.timings.view = Date.now() - viewStart;

      if (viewExchange.operations > 0) {
        console.log(`  ✓ Claude retrieved from memory (${viewExchange.operations} operations)`);
      }

      // Validate the final answer, i.e. the one written after Claude
      // received the tool results.
      const textBlocks = viewExchange.response.content.filter(block => block.type === 'text');
      const responseText = textBlocks.map(b => b.text).join(' ');

      console.log('\n[Step 5] Validating Claude\'s response...');
      const checks = [
        { label: 'Mentions inst_001', test: responseText.includes('inst_001') },
        { label: 'Mentions HIGH persistence', test: responseText.toLowerCase().includes('high') },
        { label: 'Understood the data', test: responseText.length > 50 }
      ];

      let allPassed = true;
      for (const check of checks) {
        const status = check.test ? '✓' : '✗';
        console.log(`  ${status} ${check.label}`);
        if (!check.test) allPassed = false;
      }

      if (!allPassed) {
        console.log('\n  Response:', responseText);
        throw new Error('Validation checks failed');
      }

      results.success = true;
    }

  } catch (error) {
    console.error('\n✗ TEST FAILED:', error.message);
    if (error.stack) {
      console.error('\nStack trace:', error.stack);
    }
    results.errors.push(error.message);
    results.success = false;
  } finally {
    // Cleanup
    console.log('\n[Cleanup] Removing test data...');
    await backend.cleanup();
  }

  // Results summary
  console.log('\n━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━');
  console.log('  TEST RESULTS');
  console.log('━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n');

  if (results.success) {
    console.log('✅ SUCCESS: Memory tool integration validated');
    console.log('\nKey Findings:');
    console.log(`  • API calls made: ${results.apiCalls}`);
    console.log(`  • Memory operations: ${results.memoryOperations}`);

    if (results.timings.create) {
      console.log(`  • CREATE latency: ${results.timings.create}ms`);
    }
    if (results.timings.view) {
      console.log(`  • VIEW latency: ${results.timings.view}ms`);
    }

    console.log('\nNext Steps:');
    console.log('  1. Test with all 18 Tractatus rules');
    console.log('  2. Test enforcement of inst_016, inst_017, inst_018');
    console.log('  3. Measure context editing effectiveness');
  } else {
    console.log('❌ FAILURE: Test did not pass');
    console.log('\nErrors:');
    results.errors.forEach(err => console.log(`  • ${err}`));
  }

  console.log('\n━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n');

  return results;
}

// Run test
// Only when this file is the entry script: run the test and exit with 0 on
// success, 1 on a failed run or an unexpected rejection.
if (require.main === module) {
  (async () => {
    let exitCode = 1;
    try {
      const { success } = await runAnthropicMemoryTest();
      exitCode = success ? 0 : 1;
    } catch (error) {
      console.error('Fatal error:', error);
    }
    process.exit(exitCode);
  })();
}

module.exports = { runAnthropicMemoryTest };