#!/usr/bin/env python3
"""Translate Glossary using DeepL API.

Handles large documents by chunking content at paragraph boundaries so
each request stays within a safe size for the API.
"""
import os
import re
import sys
from pathlib import Path

import deepl  # third-party DeepL client

# Load API key from environment; fail fast — every translation call needs it.
API_KEY = os.getenv('DEEPL_API_KEY')
if not API_KEY:
    print("Error: DEEPL_API_KEY not found in environment")
    sys.exit(1)

translator = deepl.Translator(API_KEY)


def extract_frontmatter(content):
    """Split *content* into ``(frontmatter, body)``.

    Returns ``(None, content)`` when the document does not start with a
    ``---``-delimited YAML frontmatter block.
    """
    match = re.match(r'^---\n(.*?)\n---\n(.*)$', content, re.DOTALL)
    if match:
        return match.group(1), match.group(2)
    return None, content


def update_frontmatter(frontmatter, lang_code, lang_name):
    """Rewrite frontmatter fields for a translated copy.

    - ``title``: original title with a ``(<lang_name>)`` suffix appended
    - ``slug``: replaced by ``glossary-<lang_code>``
    - ``modified``: stamped with a fixed date (hard-coded; bump when
      regenerating translations)
    Other lines pass through unchanged.
    """
    updated = []
    for line in frontmatter.split('\n'):
        if line.startswith('title:'):
            # Keep original title, add language suffix
            title = line.split(':', 1)[1].strip()
            updated.append(f'title: {title} ({lang_name})')
        elif line.startswith('slug:'):
            updated.append(f'slug: glossary-{lang_code}')
        elif line.startswith('modified:'):
            updated.append('modified: 2025-11-01')
        else:
            updated.append(line)
    return '\n'.join(updated)


def chunk_text(text, max_chunk_size=50000):
    """Split *text* into chunks of at most *max_chunk_size* characters,
    breaking only at blank-line (paragraph) boundaries.

    NOTE: a single paragraph longer than *max_chunk_size* still becomes
    its own (oversized) chunk — paragraphs are never split internally.
    """
    paragraphs = text.split('\n\n')
    chunks = []
    current_chunk = []
    current_size = 0
    for para in paragraphs:
        # +2 accounts for the '\n\n' separator restored by join() below;
        # without it, assembled chunks could exceed max_chunk_size.
        para_size = len(para) + 2
        if current_size + para_size > max_chunk_size and current_chunk:
            chunks.append('\n\n'.join(current_chunk))
            current_chunk = [para]
            current_size = para_size
        else:
            current_chunk.append(para)
            current_size += para_size
    if current_chunk:
        chunks.append('\n\n'.join(current_chunk))
    return chunks


def translate_content(content, target_lang):
    """Translate *content* to *target_lang* chunk by chunk via DeepL.

    Re-raises any DeepL/network error after reporting which chunk failed,
    so a partial run never silently produces a truncated document.
    """
    print(f" Translating to {target_lang}...")
    print(f" Content length: {len(content)} characters")
    chunks = chunk_text(content, max_chunk_size=50000)
    print(f" Split into {len(chunks)} chunks")
    translated_chunks = []
    for i, chunk in enumerate(chunks, 1):
        print(f" Translating chunk {i}/{len(chunks)}...", end='', flush=True)
        try:
            result = translator.translate_text(
                chunk,
                target_lang=target_lang,
                preserve_formatting=True,
                tag_handling='html',
            )
            translated_chunks.append(result.text)
            print(" ✓")
        except Exception as e:
            print(f" ✗ Error: {e}")
            raise
    return '\n\n'.join(translated_chunks)


def translate_glossary(input_file, output_file, lang_code, lang_name):
    """Translate one glossary file end to end.

    Reads *input_file*, translates its body to *lang_code* (frontmatter,
    if present, is adapted rather than machine-translated), and writes the
    result to *output_file*.
    """
    print(f"\n{'='*60}")
    print(f"Translating Glossary to {lang_name} ({lang_code})")
    print(f"{'='*60}\n")

    # Read input file (pathlib handles the open/close for us)
    print(f"Reading: {input_file}")
    content = Path(input_file).read_text(encoding='utf-8')

    # Extract frontmatter and content
    frontmatter, main_content = extract_frontmatter(content)
    if frontmatter:
        print("Frontmatter extracted")
        updated_frontmatter = update_frontmatter(frontmatter, lang_code, lang_name)
    else:
        print("No frontmatter found")
        updated_frontmatter = None

    # Translate content; DeepL expects upper-case target codes ('DE', 'FR')
    translated_content = translate_content(main_content, lang_code.upper())

    # Reassemble
    if updated_frontmatter:
        final_content = f"---\n{updated_frontmatter}\n---\n{translated_content}"
    else:
        final_content = translated_content

    # Write output
    print(f"\nWriting: {output_file}")
    Path(output_file).write_text(final_content, encoding='utf-8')

    print("✓ Translation complete!")
    print(f" Output: {output_file}")
    print(f" Size: {len(final_content)} characters\n")


if __name__ == '__main__':
    base_dir = Path(__file__).parent.parent
    input_file = base_dir / 'docs/markdown/GLOSSARY.md'

    # Translate to German
    output_de = base_dir / 'docs/markdown/GLOSSARY-DE.md'
    translate_glossary(input_file, output_de, 'de', 'Deutsch')

    # Translate to French
    output_fr = base_dir / 'docs/markdown/GLOSSARY-FR.md'
    translate_glossary(input_file, output_fr, 'fr', 'Français')

    print(f"\n{'='*60}")
    print("All translations complete!")
    print(f"{'='*60}")