tractatus/scripts/translate-glossary.py
TheFlow f0db6052ad feat: add German and French glossary translations via DeepL
Created translations using DeepL API:
- GLOSSARY-DE.md (67KB, German translation)
- GLOSSARY-FR.md (71KB, French translation)

Added translate-glossary.py script for automated translation with:
- Frontmatter preservation
- Chunked translation for large documents
- DeepL API integration

Updated generate-public-pdfs.js to include:
- tractatus-agentic-governance-system-glossary-of-terms-deutsch
- tractatus-agentic-governance-system-glossary-of-terms-franais

Both documents migrated to database and PDFs generated locally.
Production deployment will generate PDFs on server.

Note: Port numbers (27027/27017) are part of canonical "27027 Incident"
educational example, not actual infrastructure exposure.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-01 09:53:25 +13:00

146 lines
4.4 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Translate Glossary using DeepL API
Handles large documents by chunking content
"""
import os
import re
import sys
from datetime import date
from pathlib import Path

import deepl
# Load the DeepL API key from the environment; fail fast when it is
# missing or empty, since nothing below can work without credentials.
if not (API_KEY := os.getenv('DEEPL_API_KEY')):
    print("Error: DEEPL_API_KEY not found in environment")
    sys.exit(1)

# Shared client used by every translation call in this script.
translator = deepl.Translator(API_KEY)
def extract_frontmatter(content):
    """Split a document into its YAML frontmatter and body.

    Returns a ``(frontmatter, body)`` tuple. When the document does not
    start with a ``---``-fenced block, frontmatter is None and the body
    is the whole document unchanged.
    """
    parsed = re.match(r'^---\n(.*?)\n---\n(.*)$', content, re.DOTALL)
    if parsed is None:
        return None, content
    return parsed.groups()
def update_frontmatter(frontmatter, lang_code, lang_name, modified=None):
    """Rewrite YAML frontmatter fields for a translated document.

    Args:
        frontmatter: Raw frontmatter text (without the ``---`` fences).
        lang_code: Language code used to build the slug (e.g. 'de').
        lang_name: Human-readable language name appended to the title.
        modified: ISO date string for the ``modified:`` field. Defaults to
            today's date — the original hard-coded '2025-11-01' would have
            gone stale on every later run.

    Returns:
        The updated frontmatter as a string (line structure preserved).
    """
    if modified is None:
        modified = date.today().isoformat()
    updated = []
    for line in frontmatter.split('\n'):
        if line.startswith('title:'):
            # Keep the original title, add a language-name suffix
            title = line.split(':', 1)[1].strip()
            updated.append(f'title: {title} ({lang_name})')
        elif line.startswith('slug:'):
            updated.append(f'slug: glossary-{lang_code}')
        elif line.startswith('modified:'):
            updated.append(f'modified: {modified}')
        else:
            # All other fields pass through untouched
            updated.append(line)
    return '\n'.join(updated)
def chunk_text(text, max_chunk_size=50000):
    """Split text into chunks of at most ~max_chunk_size characters.

    Breaks only at blank-line (paragraph) boundaries, so markdown
    structure survives intact. A single paragraph longer than
    max_chunk_size still becomes its own oversized chunk. The running
    size count ignores the rejoining '\\n\\n' separators, matching the
    original accounting.
    """
    chunks = []
    pending = []
    pending_len = 0
    for paragraph in text.split('\n\n'):
        if pending and pending_len + len(paragraph) > max_chunk_size:
            # Current chunk would overflow: flush it and start a new one
            chunks.append('\n\n'.join(pending))
            pending = [paragraph]
            pending_len = len(paragraph)
        else:
            pending.append(paragraph)
            pending_len += len(paragraph)
    if pending:
        chunks.append('\n\n'.join(pending))
    return chunks
def translate_content(content, target_lang):
    """Translate markdown content chunk by chunk via the DeepL client.

    Splits the content at paragraph boundaries (see chunk_text) so each
    API call stays under the service's size limits, then rejoins the
    translated pieces. Raises whatever the DeepL client raises on
    failure, after printing the error.
    """
    print(f" Translating to {target_lang}...")
    print(f" Content length: {len(content)} characters")
    chunks = chunk_text(content, max_chunk_size=50000)
    total = len(chunks)
    print(f" Split into {total} chunks")
    results = []
    for index, chunk in enumerate(chunks, 1):
        print(f" Translating chunk {index}/{total}...", end='', flush=True)
        try:
            # tag_handling='html' + preserve_formatting keeps markdown
            # markup from being mangled by the translator
            response = translator.translate_text(
                chunk,
                target_lang=target_lang,
                preserve_formatting=True,
                tag_handling='html',
            )
        except Exception as exc:
            print(f" ✗ Error: {exc}")
            raise
        results.append(response.text)
        print("")
    return '\n\n'.join(results)
def translate_glossary(input_file, output_file, lang_code, lang_name):
    """Translate one glossary markdown file end to end.

    Reads input_file, separates and updates its YAML frontmatter,
    translates the body via DeepL, reassembles the document, and writes
    it to output_file (UTF-8).
    """
    banner = '=' * 60
    print(f"\n{banner}")
    print(f"Translating Glossary to {lang_name} ({lang_code})")
    print(f"{banner}\n")

    # Read the source document
    print(f"Reading: {input_file}")
    content = Path(input_file).read_text(encoding='utf-8')

    # Separate frontmatter (if any) from the translatable body.
    # Truthiness check: an empty frontmatter block is treated as absent,
    # matching the original behavior.
    frontmatter, main_content = extract_frontmatter(content)
    if frontmatter:
        print("Frontmatter extracted")
        updated_frontmatter = update_frontmatter(frontmatter, lang_code, lang_name)
    else:
        print("No frontmatter found")
        updated_frontmatter = None

    # Translate the body only — frontmatter stays in the source language
    translated_content = translate_content(main_content, lang_code.upper())

    # Reassemble frontmatter + translated body
    if updated_frontmatter:
        final_content = f"---\n{updated_frontmatter}\n---\n{translated_content}"
    else:
        final_content = translated_content

    print(f"\nWriting: {output_file}")
    Path(output_file).write_text(final_content, encoding='utf-8')

    print("✓ Translation complete!")
    print(f" Output: {output_file}")
    print(f" Size: {len(final_content)} characters\n")
if __name__ == '__main__':
    # Script lives in tractatus/scripts/, so the project root is two up
    base_dir = Path(__file__).parent.parent
    source = base_dir / 'docs/markdown/GLOSSARY.md'

    # (language code, display name, output filename) per target language
    targets = [
        ('de', 'Deutsch', 'GLOSSARY-DE.md'),
        ('fr', 'Français', 'GLOSSARY-FR.md'),
    ]
    for code, name, filename in targets:
        translate_glossary(source, base_dir / 'docs/markdown' / filename, code, name)

    print(f"\n{'='*60}")
    print("All translations complete!")
    print(f"{'='*60}")