Phase B of PLAN_LICENSE_STANDARDISATION_EUPL12_20260419. Follows Phase A (c85f310f,4ddc54a0) which flipped the LICENSE file + README; this commit propagates EUPL-1.2 through source-file headers. 21 files touched across 4 distinct Apache-reference variants: - V1 (14 files) — full Apache header block (JS /* ... */): 2 routes + 1 controller + 7 services + 2 models + 3 utils. Replaced with equivalent EUPL-1.2 block pointing at EC canonical URL. - V2 (2 files) — inline JSDoc license line (Copyright Tractatus Project): src/routes/calendar.routes.js + src/models/ScheduledTask.model.js. Replaced with EUPL-1.2 v. 1.2 equivalent. - V3 (4 files) — Python docstring 'License: Apache 2.0': all 4 al-integration Python files. Replaced with 'License: EUPL-1.2'. - V4 (1 file) — al-integration/README.md bare 'Apache 2.0' under '## License' heading. Replaced with 'EUPL-1.2'. Verification: - grep -r "Apache License|Apache 2.0|apache.org/licenses" src/ al-integration/ returns zero matches (modulo venv). - Unit tests: 524/524 pass (npm run test:unit). - Integration test failures (177) are DB-connection infrastructure, pre-existing, unrelated to this header-only change. Sole author basis: TheFlow, 930+ commits, unilateral relicensing (same as Phase A). Replacement infrastructure also committed: scripts/relicense-apache-to-eupl.js (auto-detecting variant replacement, idempotent, --dry-run mode). Reusable for Phase C (community-repo sweep) if pattern structure aligns. Out-of-scope Apache mentions still in the repo (next pass, NOT Phase B): - SESSION_HANDOFF_ENFORCEMENT_COMPLETE.md (root doc) - CLAUDE_Tractatus_Maintenance_Guide.md (root doc) - For Claude Web/tractatus-claude-web-complete/** (docs snapshot subdirectory) Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
381 lines
12 KiB
Python
381 lines
12 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Feedback Analyzer Training Script
|
|
|
|
Trains the feedback analyzer agent to categorize and prioritize feedback.
|
|
Uses actual feedback data from MongoDB + synthetic training examples.
|
|
|
|
This is USEFUL training - helps you triage real feedback efficiently.
|
|
|
|
Usage:
|
|
python train_analyzer.py --mode setup # Setup and test
|
|
python train_analyzer.py --mode train # Run training iteration
|
|
|
|
Requirements:
|
|
- OpenAI API key or local vLLM endpoint
|
|
- MongoDB with feedback collection
|
|
- Agent Lightning 0.2.2+
|
|
|
|
License: EUPL-1.2
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import asyncio
|
|
import json
|
|
import os
|
|
from pathlib import Path
|
|
from typing import List, Dict
|
|
|
|
from pymongo import MongoClient
|
|
from rich.console import Console
|
|
from rich.table import Table
|
|
|
|
import agentlightning as agl
|
|
|
|
# Import analyzer agent from the sibling `agents` package.
import sys

# Make the repository root importable so `agents.feedback_analyzer` resolves
# when this file is run directly as a script (not installed as a package).
sys.path.insert(0, str(Path(__file__).parent.parent))
from agents.feedback_analyzer import (
    feedback_analyzer_agent,
    FeedbackTask,
    FeedbackCategory,
    Severity
)

# Shared rich console used for all CLI output in this script.
console = Console()
|
|
|
|
|
|
# Form type mapping to expected categories.
# Keys are the `feedback_type` values coming off the submission form; values
# are the FeedbackCategory outcomes the analyzer is expected to choose from
# (None = unconstrained). Presumably used as a prior/hint during training —
# no consumer is visible in this file, confirm against the analyzer agent.
FORM_TYPE_HINTS: Dict[str, List[FeedbackCategory] | None] = {
    "bug": [FeedbackCategory.WEBSITE_BUG, FeedbackCategory.FRAMEWORK_ISSUE],
    "technical_question": [FeedbackCategory.CONTENT_GAP, FeedbackCategory.FRAMEWORK_ISSUE],
    "feature": [FeedbackCategory.FEATURE_REQUEST],
    "general": None,  # Could be anything
    "research": [FeedbackCategory.POSITIVE, FeedbackCategory.FEATURE_REQUEST],
    "commercial": [FeedbackCategory.NOISE],  # Human handles these
}
|
|
|
|
|
|
def load_feedback_from_mongodb() -> List[FeedbackTask]:
    """
    Load real feedback data from MongoDB.

    Reads up to 100 documents from the `feedback` collection of the
    `tractatus_dev` database (connection string from MONGODB_URI, defaulting
    to a local instance) and converts each into a FeedbackTask, with safe
    defaults for any missing fields.

    Returns:
        List of FeedbackTask objects from database; empty list when the
        database is unreachable (callers then fall back to synthetic data).
    """
    try:
        # FIX: the original never closed the client, leaking the connection
        # pool. The context manager guarantees cleanup even on error.
        with MongoClient(os.getenv("MONGODB_URI", "mongodb://localhost:27017/")) as client:
            db = client.tractatus_dev
            feedback_collection = db.feedback

            feedback_docs = list(feedback_collection.find().limit(100))

        tasks = [
            FeedbackTask(
                feedback_id=str(doc.get("_id", "unknown")),
                rating=doc.get("rating", 3),
                comment=doc.get("comment", ""),
                page=doc.get("page", "/"),
                feedback_type=doc.get("type", "general"),
                governance_passed=doc.get("governance_passed", True),
            )
            for doc in feedback_docs
        ]

        console.print(f"[green]Loaded {len(tasks)} feedback entries from MongoDB[/green]")
        return tasks

    except Exception as e:
        # Deliberate best-effort: an unreachable DB downgrades to synthetic
        # data rather than aborting the run.
        console.print(f"[yellow]Could not load from MongoDB: {e}[/yellow]")
        console.print("[yellow]Using synthetic data instead[/yellow]")
        return []
|
|
|
|
|
|
def generate_synthetic_training_data() -> List[FeedbackTask]:
    """
    Generate realistic synthetic training data.

    The hand-written examples cover every category the analyzer must learn:
    website bugs, framework issues, content gaps, feature requests, positive
    feedback, and noise/spam.

    Returns:
        List of synthetic FeedbackTask objects
    """
    # One row per example: (feedback_id, rating, comment, page, feedback_type),
    # grouped by the category the analyzer is expected to assign.
    rows = [
        # Website bugs
        ("syn_001", 2, "The Discord link doesn't work on mobile. Gets stuck loading.", "/", "bug"),
        ("syn_002", 1, "Page loads extremely slowly. Takes 10+ seconds.", "/integrations/agent-lightning.html", "bug"),
        # Framework issues
        ("syn_003", 2, "BoundaryEnforcer blocks too aggressively. Can't submit legitimate feedback.", "/", "technical_question"),
        ("syn_004", 3, "How do I configure the CrossReferenceValidator thresholds?", "/researcher.html", "technical_question"),
        # Content gaps
        ("syn_005", 3, "The installation guide assumes too much knowledge. Need more beginner-friendly docs.", "/implementer.html", "technical_question"),
        ("syn_006", 2, "What's the difference between BoundaryEnforcer and CrossReferenceValidator? Docs don't explain.", "/researcher.html", "technical_question"),
        # Feature requests
        ("syn_007", 4, "Would love to see integration with LangChain. Is that planned?", "/integrations/agent-lightning.html", "feature"),
        ("syn_008", 3, "Can you add support for custom governance rules?", "/implementer.html", "feature"),
        # Positive feedback
        ("syn_009", 5, "Excellent work on research transparency! Rare to see this level of honesty.", "/integrations/agent-lightning.html", "general"),
        ("syn_010", 5, "This is exactly what AI governance needs. Thank you!", "/", "general"),
        # Noise/spam
        ("syn_011", 1, "test", "/", "general"),
        ("syn_012", 5, "Great!!!", "/", "general"),
    ]

    synthetic_examples = [
        FeedbackTask(
            feedback_id=fid,
            rating=rating,
            comment=comment,
            page=page,
            feedback_type=ftype,
        )
        for fid, rating, comment, page, ftype in rows
    ]

    console.print(f"[yellow]Generated {len(synthetic_examples)} synthetic training examples[/yellow]")
    return synthetic_examples
|
|
|
|
|
|
def display_analysis_results(results: List[Dict]):
    """
    Display analysis results in a formatted rich table.

    Entries whose "status" is not "success" are skipped; feedback IDs are
    truncated to 8 characters for compact display.

    Args:
        results: List of analysis result dictionaries
    """
    table = Table(title="Feedback Analysis Results")

    # Column layout: (heading, rich style)
    for heading, colour in (
        ("ID", "cyan"),
        ("Rating", "magenta"),
        ("Category", "green"),
        ("Severity", "yellow"),
        ("Priority", "red"),
        ("Reward", "blue"),
    ):
        table.add_column(heading, style=colour)

    for entry in results:
        if entry["status"] != "success":
            continue
        analysis = entry["analysis"]
        table.add_row(
            entry.get("feedback_id", "unknown")[:8],
            str(entry.get("rating", "-")),
            analysis["category"],
            analysis["severity"],
            f"{analysis['priority']:.1f}",
            f"{entry['reward']:.2f}",
        )

    console.print(table)
|
|
|
|
|
|
def setup_test():
    """
    Smoke test: verify the pipeline wiring without running actual training.

    Loads (or synthesizes) a dataset, prints one sample task and the current
    configuration, then returns a small status summary for the caller.
    """
    console.print("[bold cyan]Feedback Analyzer Setup Test[/bold cyan]\n")

    # Load or generate data
    console.print("[yellow]1. Loading training data...[/yellow]")
    db_tasks = load_feedback_from_mongodb()
    synth_tasks = generate_synthetic_training_data()

    # Prefer real feedback; an empty result falls through to synthetic data.
    dataset = db_tasks or synth_tasks

    console.print(f"[green]✓ Training dataset ready: {len(dataset)} examples[/green]\n")

    # Show one example so the operator can eyeball the data shape
    console.print("[yellow]2. Testing analyzer agent...[/yellow]")
    sample = dataset[0]

    console.print(f"   Feedback: \"{sample.comment[:60]}...\"")
    console.print(f"   Rating: {sample.rating}/5")
    console.print(f"   Type: {sample.feedback_type}")
    console.print(f"   Page: {sample.page}")
    console.print()

    # Note: Actual analysis requires LLM endpoint
    console.print("[green]✓ Analyzer agent code loaded successfully[/green]\n")

    # Display configuration
    console.print("[yellow]3. Configuration:[/yellow]")
    console.print(f"   Dataset size: {len(dataset)}")
    console.print(f"   Agent: feedback_analyzer_agent")
    console.print(f"   LLM endpoint: {os.getenv('OPENAI_BASE_URL', 'Not configured')}")
    console.print(f"   AL version: {agl.__version__}")
    console.print()

    console.print("[bold green]✓ Setup test complete![/bold green]\n")

    # Show next steps
    console.print("[cyan]Next Steps:[/cyan]")
    console.print("1. Configure OpenAI API key or local vLLM endpoint")
    console.print("2. Run: python train_analyzer.py --mode train")
    console.print("3. Review analysis results")
    console.print("4. Validate categorizations (improves rewards)")
    console.print()

    return {
        "status": "ready",
        "dataset_size": len(dataset),
        "real_feedback": len(db_tasks),
        "synthetic_feedback": len(synth_tasks),
    }
|
|
|
|
|
|
def run_training_iteration():
    """
    Run one training iteration with the analyzer.

    This is a simplified version that:
    1. Verifies an LLM endpoint is configured
    2. Loads training data (real feedback, else synthetic)
    3. Reports the architecture status for manual validation

    Full AL training (with LightningStore + Trainer) requires GPU.

    Returns:
        Status dict: {"status": "error", ...} when no LLM endpoint is
        configured, otherwise {"status": "architecture_ready", ...}.
    """
    console.print("[bold cyan]Feedback Analyzer Training Iteration[/bold cyan]\n")

    # Check for API key — bail out early rather than failing mid-run
    if not os.getenv("OPENAI_API_KEY") and not os.getenv("OPENAI_BASE_URL"):
        console.print("[red]Error: OPENAI_API_KEY or OPENAI_BASE_URL not configured[/red]")
        console.print("[yellow]Set environment variable or use local vLLM endpoint[/yellow]")
        return {"status": "error", "reason": "no_llm_endpoint"}

    # Load data (real feedback preferred, synthetic as fallback)
    real_feedback = load_feedback_from_mongodb()
    synthetic_feedback = generate_synthetic_training_data()
    dataset = real_feedback if real_feedback else synthetic_feedback

    console.print(f"[green]Dataset: {len(dataset)} examples[/green]\n")

    # FIX: the original built an agl.LLM endpoint config here and never used
    # it (dead local `llm_config`). Dropped; reintroduce it when the full
    # training loop (LightningStore + Trainer) actually consumes an endpoint.

    console.print("[yellow]Note: Full AL training requires:[/yellow]")
    console.print("  • LightningStore server (agl store)")
    console.print("  • Training algorithm (Tinker/GRPO/PPO)")
    console.print("  • GPU acceleration (ROCm + MS-S1 Max)")
    console.print()

    console.print("[green]Current Status:[/green]")
    console.print("  ✓ Analyzer agent implemented with @agl.rollout")
    console.print("  ✓ Reward function configured")
    console.print("  ✓ Event emission (emit_message, emit_reward)")
    console.print("  ✓ Training data pipeline ready")
    console.print("  🚧 LightningStore setup (pending GPU)")
    console.print("  🚧 Full RL training loop (pending GPU)")
    console.print()

    return {
        "status": "architecture_ready",
        "dataset_size": len(dataset),
        "agent": "feedback_analyzer_agent",
        "training_mode": "cpu_mvp",
    }
|
|
|
|
|
|
def main():
    """Entry point: parse CLI args and dispatch to the selected mode."""
    parser = argparse.ArgumentParser(
        description="Train feedback analyzer agent with Agent Lightning"
    )
    parser.add_argument(
        "--mode",
        type=str,
        choices=["setup", "train"],
        default="setup",
        help="Training mode"
    )

    args = parser.parse_args()

    agl.configure_logger()

    # FIX: argparse's `choices` + `default` guarantee args.mode is one of
    # these keys, so the original `else: parser.print_help()` branch was
    # unreachable; a dispatch table also removes the duplicated result print.
    runners = {"setup": setup_test, "train": run_training_iteration}
    result = runners[args.mode]()
    console.print(f"\n[bold green]Result:[/bold green] {json.dumps(result, indent=2)}\n")


if __name__ == "__main__":
    main()
|