tractatus/scripts/translate-glossary.py
TheFlow f0db6052ad feat: add German and French glossary translations via DeepL
Created translations using DeepL API:
- GLOSSARY-DE.md (67KB, German translation)
- GLOSSARY-FR.md (71KB, French translation)

Added translate-glossary.py script for automated translation with:
- Frontmatter preservation
- Chunked translation for large documents
- DeepL API integration

Updated generate-public-pdfs.js to include:
- tractatus-agentic-governance-system-glossary-of-terms-deutsch
- tractatus-agentic-governance-system-glossary-of-terms-franais

Both documents migrated to database and PDFs generated locally.
Production deployment will generate PDFs on server.

Note: Port numbers (27027/27017) are part of canonical "27027 Incident"
educational example, not actual infrastructure exposure.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-01 09:53:25 +13:00

146 lines
4.4 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Translate Glossary using DeepL API
Handles large documents by chunking content
"""
import os
import re
import sys
from datetime import date
from pathlib import Path

import deepl
# Load the DeepL API key from the environment; fail fast when it is
# missing or empty, since nothing below can work without credentials.
if not (API_KEY := os.getenv('DEEPL_API_KEY')):
    print("Error: DEEPL_API_KEY not found in environment")
    sys.exit(1)

# Shared client used by every translation call in this script.
translator = deepl.Translator(API_KEY)
def extract_frontmatter(content):
    """Split a document into its YAML frontmatter and body.

    Returns a ``(frontmatter, body)`` tuple. When the document does not
    start with a ``---``-fenced block, frontmatter is None and the body
    is the whole document unchanged.
    """
    parsed = re.match(r'^---\n(.*?)\n---\n(.*)$', content, re.DOTALL)
    if parsed is None:
        return None, content
    return parsed.groups()
def update_frontmatter(frontmatter, lang_code, lang_name, modified=None):
    """Rewrite YAML frontmatter fields for a translated document.

    Args:
        frontmatter: Raw frontmatter text (without the ``---`` fences).
        lang_code: Language code used to build the slug (e.g. 'de').
        lang_name: Human-readable language name appended to the title.
        modified: ISO date string for the ``modified:`` field. Defaults to
            today's date — the original hard-coded '2025-11-01' would have
            gone stale on every later run.

    Returns:
        The updated frontmatter as a string (line structure preserved).
    """
    if modified is None:
        modified = date.today().isoformat()
    updated = []
    for line in frontmatter.split('\n'):
        if line.startswith('title:'):
            # Keep the original title, add a language-name suffix
            title = line.split(':', 1)[1].strip()
            updated.append(f'title: {title} ({lang_name})')
        elif line.startswith('slug:'):
            updated.append(f'slug: glossary-{lang_code}')
        elif line.startswith('modified:'):
            updated.append(f'modified: {modified}')
        else:
            # All other fields pass through untouched
            updated.append(line)
    return '\n'.join(updated)
def chunk_text(text, max_chunk_size=50000):
    """Split text into chunks of at most ~max_chunk_size characters.

    Breaks only at blank-line (paragraph) boundaries, so markdown
    structure survives intact. A single paragraph longer than
    max_chunk_size still becomes its own oversized chunk. The running
    size count ignores the rejoining '\\n\\n' separators, matching the
    original accounting.
    """
    chunks = []
    pending = []
    pending_len = 0
    for paragraph in text.split('\n\n'):
        if pending and pending_len + len(paragraph) > max_chunk_size:
            # Current chunk would overflow: flush it and start a new one
            chunks.append('\n\n'.join(pending))
            pending = [paragraph]
            pending_len = len(paragraph)
        else:
            pending.append(paragraph)
            pending_len += len(paragraph)
    if pending:
        chunks.append('\n\n'.join(pending))
    return chunks
def translate_content(content, target_lang):
    """Translate markdown content chunk by chunk via the DeepL client.

    Splits the content at paragraph boundaries (see chunk_text) so each
    API call stays under the service's size limits, then rejoins the
    translated pieces. Raises whatever the DeepL client raises on
    failure, after printing the error.
    """
    print(f" Translating to {target_lang}...")
    print(f" Content length: {len(content)} characters")
    chunks = chunk_text(content, max_chunk_size=50000)
    total = len(chunks)
    print(f" Split into {total} chunks")
    results = []
    for index, chunk in enumerate(chunks, 1):
        print(f" Translating chunk {index}/{total}...", end='', flush=True)
        try:
            # tag_handling='html' + preserve_formatting keeps markdown
            # markup from being mangled by the translator
            response = translator.translate_text(
                chunk,
                target_lang=target_lang,
                preserve_formatting=True,
                tag_handling='html',
            )
        except Exception as exc:
            print(f" ✗ Error: {exc}")
            raise
        results.append(response.text)
        print("")
    return '\n\n'.join(results)
def translate_glossary(input_file, output_file, lang_code, lang_name):
    """Translate one glossary markdown file end to end.

    Reads input_file, separates and updates its YAML frontmatter,
    translates the body via DeepL, reassembles the document, and writes
    it to output_file (UTF-8).
    """
    banner = '=' * 60
    print(f"\n{banner}")
    print(f"Translating Glossary to {lang_name} ({lang_code})")
    print(f"{banner}\n")

    # Read the source document
    print(f"Reading: {input_file}")
    content = Path(input_file).read_text(encoding='utf-8')

    # Separate frontmatter (if any) from the translatable body.
    # Truthiness check: an empty frontmatter block is treated as absent,
    # matching the original behavior.
    frontmatter, main_content = extract_frontmatter(content)
    if frontmatter:
        print("Frontmatter extracted")
        updated_frontmatter = update_frontmatter(frontmatter, lang_code, lang_name)
    else:
        print("No frontmatter found")
        updated_frontmatter = None

    # Translate the body only — frontmatter stays in the source language
    translated_content = translate_content(main_content, lang_code.upper())

    # Reassemble frontmatter + translated body
    if updated_frontmatter:
        final_content = f"---\n{updated_frontmatter}\n---\n{translated_content}"
    else:
        final_content = translated_content

    print(f"\nWriting: {output_file}")
    Path(output_file).write_text(final_content, encoding='utf-8')

    print("✓ Translation complete!")
    print(f" Output: {output_file}")
    print(f" Size: {len(final_content)} characters\n")
if __name__ == '__main__':
    # Script lives in tractatus/scripts/, so the project root is two up
    base_dir = Path(__file__).parent.parent
    source = base_dir / 'docs/markdown/GLOSSARY.md'

    # (language code, display name, output filename) per target language
    targets = [
        ('de', 'Deutsch', 'GLOSSARY-DE.md'),
        ('fr', 'Français', 'GLOSSARY-FR.md'),
    ]
    for code, name, filename in targets:
        translate_glossary(source, base_dir / 'docs/markdown' / filename, code, name)

    print(f"\n{'='*60}")
    print("All translations complete!")
    print(f"{'='*60}")