# Burmese translation module using Claude
from typing import Dict, Optional
from loguru import logger
import anthropic
import re
import config
import time


class BurmeseTranslator:
    """Translate compiled English articles into casual Burmese via the Anthropic API.

    Configuration is read from the project ``config`` module:
    ``config.ANTHROPIC_API_KEY`` plus the ``preserve_terms``, ``model``,
    ``max_tokens`` and ``temperature`` entries of ``config.TRANSLATION``.
    """

    # Myanmar Unicode block: U+1000 to U+109F. Compiled once for all instances.
    _BURMESE_CHARS = re.compile(r'[\u1000-\u109F]')

    def __init__(self) -> None:
        self.client = anthropic.Anthropic(api_key=config.ANTHROPIC_API_KEY)
        # Terms that must stay in English in the translated output.
        self.preserve_terms = config.TRANSLATION['preserve_terms']

    def translate_article(self, article: Dict) -> Dict:
        """Translate a compiled article to Burmese.

        Args:
            article: Mapping that provides at least the ``'title'``,
                ``'excerpt'`` and ``'content'`` keys.

        Returns:
            A new dict with every original key plus ``'title_burmese'``,
            ``'excerpt_burmese'`` and ``'content_burmese'``. If a translation
            step raises, those three keys carry the original English text.
        """
        logger.info(f"Translating article: {article['title'][:50]}...")

        try:
            # Translate title
            title_burmese = self.translate_text(
                text=article['title'],
                context="This is an article title about AI technology"
            )

            # Translate excerpt
            excerpt_burmese = self.translate_text(
                text=article['excerpt'],
                context="This is a brief article summary"
            )

            # Translate main content (in chunks if too long)
            content_burmese = self.translate_long_text(article['content'])

            # Return article with Burmese translations
            return {
                **article,
                'title_burmese': title_burmese,
                'excerpt_burmese': excerpt_burmese,
                'content_burmese': content_burmese
            }

        except Exception as e:
            logger.error(f"Translation error: {e}")
            # Fallback: return original text if translation fails
            return {
                **article,
                'title_burmese': article['title'],
                'excerpt_burmese': article['excerpt'],
                'content_burmese': article['content']
            }

    def translate_text(self, text: str, context: str = "") -> str:
        """Translate a text block to Burmese.

        Args:
            text: English text to translate.
            context: Short hint describing what the text is; it is embedded
                in the prompt.

        Returns:
            The post-processed translation. Empty or whitespace-only input is
            returned unchanged without calling the API, and the original
            ``text`` is returned if the API call or post-processing raises.
        """
        # Nothing to translate: skip the API round trip entirely.
        if isinstance(text, str) and not text.strip():
            return text

        # Build preserved terms list for this text
        preserved_terms_str = ", ".join(self.preserve_terms)

        prompt = f"""Translate the following English text to Burmese (Myanmar Unicode) in a CASUAL, EASY-TO-READ style.

🎯 CRITICAL GUIDELINES:
1. Write in **CASUAL, CONVERSATIONAL Burmese** - like talking to a friend over tea
2. Use **SIMPLE, EVERYDAY words** - avoid formal or academic language
3. Explain technical concepts in **LAYMAN TERMS** - as if explaining to your grandmother
4. Keep these terms in English: {preserved_terms_str}
5. Add **brief explanations** in parentheses for complex terms
6. Use **short sentences** - easy to read on mobile
7. Break up long paragraphs - white space is good
8. Keep markdown formatting (##, **, -, etc.) intact

TARGET AUDIENCE: General Myanmar public who are curious about AI but not tech experts

TONE: Friendly, approachable, informative but not boring

EXAMPLE STYLE:
❌ Bad (too formal): "ယခု နည်းပညာသည် ဉာဏ်ရည်တု ဖြစ်စဉ်များကို အသုံးပြုပါသည်"
✅ Good (casual): "ဒီနည်းပညာက AI (အထက်တန်းကွန်ပျူတာဦးနှောက်) ကို သုံးတာပါ"

Context: {context}

Text to translate:
{text}

Casual, easy-to-read Burmese translation:"""

        try:
            message = self.client.messages.create(
                model=config.TRANSLATION['model'],
                max_tokens=config.TRANSLATION['max_tokens'],
                temperature=config.TRANSLATION['temperature'],
                messages=[{"role": "user", "content": prompt}]
            )

            translated = message.content[0].text.strip()

            # Post-process: ensure Unicode and clean up
            translated = self.post_process_translation(translated)

            return translated

        except Exception as e:
            logger.error(f"API translation error: {e}")
            return text  # Fallback to original

    @staticmethod
    def _group_paragraphs(text: str, chunk_size: int) -> list:
        """Pack blank-line separated paragraphs into chunks of roughly ``chunk_size`` characters.

        A single paragraph longer than ``chunk_size`` is kept whole as its own
        chunk rather than being cut mid-paragraph.
        """
        chunks = []
        current_chunk = ""

        for para in text.split('\n\n'):
            if len(current_chunk) + len(para) < chunk_size:
                current_chunk += para + '\n\n'
            else:
                if current_chunk:
                    chunks.append(current_chunk.strip())
                current_chunk = para + '\n\n'

        if current_chunk:
            chunks.append(current_chunk.strip())

        return chunks

    def translate_long_text(self, text: str, chunk_size: int = 2000) -> str:
        """Translate long text in chunks to stay within token limits.

        Args:
            text: English text, with paragraphs separated by blank lines.
            chunk_size: Approximate upper bound, in characters, for one chunk.

        Returns:
            The translated chunks joined with blank lines. Text shorter than
            ``chunk_size`` is translated in a single call.
        """
        # If text is short enough, translate directly
        if len(text) < chunk_size:
            return self.translate_text(text, context="This is the main article content")

        chunks = self._group_paragraphs(text, chunk_size)
        logger.info(f"Translating {len(chunks)} chunks...")

        # Translate each chunk
        translated_chunks = []
        for i, chunk in enumerate(chunks):
            logger.debug(f"Translating chunk {i+1}/{len(chunks)}")
            translated = self.translate_text(
                chunk,
                context=f"This is part {i+1} of {len(chunks)} of a longer article"
            )
            translated_chunks.append(translated)
            time.sleep(0.5)  # Rate limiting

        # Join chunks
        return '\n\n'.join(translated_chunks)

    def post_process_translation(self, text: str) -> str:
        """Clean up a translation returned by the model.

        Collapses runs of three or more newlines, inserts a space after the
        Burmese punctuation marks ``။`` and ``၊`` when a non-space character
        follows, and restores the configured casing of preserved terms.

        Args:
            text: Raw translated text.

        Returns:
            The cleaned text with surrounding whitespace stripped.
        """
        # Collapse excessive blank lines
        text = re.sub(r'(\n{3,})', '\n\n', text)

        # Ensure proper spacing after punctuation
        text = re.sub(r'([။၊])([^\s])', r'\1 \2', text)

        # Restore preserved terms whose casing was altered
        for term in self.preserve_terms:
            if term in text:
                # The exact form is already present; leave the text alone.
                continue
            # Match the term only as a standalone token. A plain substring
            # match would corrupt unrelated words (e.g. "AI" inside "said").
            # ASCII lookarounds are used instead of \b so that a term directly
            # adjacent to Burmese letters is still recognised.
            pattern = r'(?<![A-Za-z0-9])' + re.escape(term) + r'(?![A-Za-z0-9])'
            # A callable replacement inserts the term literally, so backslashes
            # in a term are never interpreted as regex escapes.
            text = re.sub(pattern, lambda _match, fixed=term: fixed, text, flags=re.IGNORECASE)

        return text.strip()

    def validate_burmese_text(self, text: str) -> bool:
        """Return True if ``text`` contains at least one Myanmar-block character.

        This only checks that some character in U+1000 to U+109F is present;
        it does not verify that the whole text is Burmese.
        """
        return bool(self._BURMESE_CHARS.search(text))


def _record_pipeline_stage(**fields) -> None:
    """Best-effort call to ``database.log_pipeline_stage`` with ``fields``.

    A failure to import or write the stage record is logged and swallowed so
    that bookkeeping problems never discard translation results.
    """
    try:
        # Imported lazily, as in the original code.
        # NOTE(review): presumably this avoids an import cycle - confirm.
        from database import log_pipeline_stage
        log_pipeline_stage(**fields)
    except Exception as e:
        logger.error(f"Failed to record pipeline stage: {e}")


def run_translator(compiled_articles: list) -> list:
    """Translate compiled articles to Burmese.

    Args:
        compiled_articles: Article dicts accepted by
            ``BurmeseTranslator.translate_article``.

    Returns:
        The translated articles. An article whose translation raises is
        skipped; an article that fails Burmese validation is still included
        but a warning is logged. Returns an empty list if the translator as a
        whole fails (for example, the client cannot be constructed).
    """
    logger.info(f"Starting translator for {len(compiled_articles)} articles...")
    start_time = time.time()

    try:
        translator = BurmeseTranslator()
        translated_articles = []

        for i, article in enumerate(compiled_articles, 1):
            logger.info(f"Translating article {i}/{len(compiled_articles)}")

            try:
                translated = translator.translate_article(article)

                # Validate translation
                if translator.validate_burmese_text(translated['content_burmese']):
                    logger.info(f"✓ Translation successful for article {i}")
                else:
                    logger.warning(f"✗ Translation validation failed for article {i}")

                # Keep the article even when validation fails; the warning flags it.
                translated_articles.append(translated)

                time.sleep(1)  # Rate limiting

            except Exception as e:
                logger.error(f"Error translating article {i}: {e}")
                continue

        duration = int(time.time() - start_time)

    except Exception as e:
        logger.error(f"Translator failed: {e}")
        _record_pipeline_stage(
            stage='translate',
            status='failed',
            error_message=str(e)
        )
        return []

    # Recorded outside the try block so that a bookkeeping failure cannot
    # turn a successful run into an empty result.
    _record_pipeline_stage(
        stage='translate',
        status='completed',
        articles_processed=len(translated_articles),
        duration=duration
    )

    logger.info(f"Translator completed in {duration}s. Articles translated: {len(translated_articles)}")
    return translated_articles


if __name__ == '__main__':
    logger.add(config.LOG_FILE, rotation="1 day")

    # Test translation
    test_article = {
        'title': 'OpenAI Releases GPT-5: A New Era of AI',
        'excerpt': 'OpenAI today announced GPT-5, the next generation of their language model.',
        'content': '''OpenAI has officially released GPT-5, marking a significant milestone in artificial intelligence development.

## Key Features

The new model includes:
- 10x more parameters than GPT-4
- Better reasoning capabilities
- Multimodal support for video
- Reduced hallucinations

CEO Sam Altman said, "GPT-5 represents our most advanced AI system yet."

The model will be available to ChatGPT Plus subscribers starting next month.'''
    }

    translator = BurmeseTranslator()
    translated = translator.translate_article(test_article)

    print("\n=== ORIGINAL ===")
    print(f"Title: {translated['title']}")
    print(f"\nContent: {translated['content'][:200]}...")

    print("\n=== BURMESE ===")
    print(f"Title: {translated['title_burmese']}")
    print(f"\nContent: {translated['content_burmese'][:200]}...")