# Improved Burmese translation module with better error handling
import re
import time
from collections import Counter
from typing import Dict, List, Optional

import anthropic
from loguru import logger

import config


def _untranslated_copy(article: Dict) -> Dict:
    """Return *article* with the ``*_burmese`` fields filled from the English text.

    Uses ``.get`` so that an article with a missing key still yields a
    fallback record instead of raising a second time inside an ``except``
    handler (which would abort the whole batch in ``run_translator``).
    """
    return {
        **article,
        'title_burmese': article.get('title', ''),
        'excerpt_burmese': article.get('excerpt', ''),
        'content_burmese': article.get('content', ''),
    }


class BurmeseTranslator:
    """Translate English articles to Burmese through the Anthropic Messages API."""

    def __init__(self):
        self.client = anthropic.Anthropic(api_key=config.ANTHROPIC_API_KEY)
        # Terms that the prompt asks the model to leave in English.
        self.preserve_terms = config.TRANSLATION['preserve_terms']

    def translate_article(self, article: Dict) -> Dict:
        """Translate a compiled article to Burmese.

        Args:
            article: Mapping with at least ``title``, ``excerpt`` and
                ``content`` string entries.

        Returns:
            A copy of ``article`` extended with ``title_burmese``,
            ``excerpt_burmese`` and ``content_burmese``. If anything goes
            wrong the English originals are used for those fields.
        """
        try:
            logger.info(f"Translating article: {article['title'][:50]}...")

            # Translate title
            title_burmese = self.translate_text(
                text=article['title'],
                context="This is an article title about AI technology",
                max_length=200,
            )

            # Translate excerpt
            excerpt_burmese = self.translate_text(
                text=article['excerpt'],
                context="This is a brief article summary",
                max_length=300,
            )

            # Translate main content with improved chunking
            content_burmese = self.translate_long_text(
                article['content'],
                chunk_size=1200,  # Reduced from 2000 for safety
            )

            # Validate translation quality; on failure try once more with
            # smaller chunks (the second result is used as-is).
            if not self.validate_translation(content_burmese, article['content']):
                logger.warning("Translation validation failed, using fallback")
                content_burmese = self.translate_long_text(
                    article['content'],
                    chunk_size=800,  # Even smaller
                )

            return {
                **article,
                'title_burmese': title_burmese,
                'excerpt_burmese': excerpt_burmese,
                'content_burmese': content_burmese,
            }

        except Exception as e:
            logger.error(f"Translation error: {e}")
            # Fallback: return original text if translation fails
            return _untranslated_copy(article)

    def translate_text(self, text: str, context: str = "", max_length: Optional[int] = None) -> str:
        """Translate a text block to Burmese.

        Args:
            text: English text to translate.
            context: Short description of what the text is, inserted into
                the prompt.
            max_length: Optional limit that is quoted in the prompt as a
                word budget; it is guidance for the model, not enforced here.

        Returns:
            The post-processed translation, or ``text`` unchanged when the
            input is blank or the API call fails.
        """
        # Nothing to translate: skip the API round-trip entirely.
        if not text or not text.strip():
            return text

        # Build preserved terms list
        preserved_terms_str = ", ".join(self.preserve_terms)

        # Add length guidance if specified
        length_guidance = ""
        if max_length:
            length_guidance = f"\n⚠️ IMPORTANT: Keep translation under {max_length} words. Be concise."

        prompt = f"""Translate the following English text to Burmese (Myanmar Unicode) in a CASUAL, EASY-TO-READ style.

🎯 CRITICAL GUIDELINES:
1. Write in **CASUAL, CONVERSATIONAL Burmese** - like talking to a friend
2. Use **SIMPLE, EVERYDAY words** - avoid formal or academic language
3. Explain technical concepts in **LAYMAN TERMS**
4. Keep these terms in English: {preserved_terms_str}
5. Add **brief explanations** in parentheses for complex terms
6. Use **short sentences** - easy to read on mobile
7. Break up long paragraphs - white space is good
8. Keep markdown formatting (##, **, -, etc.) intact{length_guidance}

🚫 CRITICAL: DO NOT REPEAT TEXT OR GET STUCK IN LOOPS!
- If you start repeating, STOP immediately
- Translate fully but concisely
- Each sentence should be unique

TARGET AUDIENCE: General Myanmar public curious about AI

Context: {context}

Text to translate:
{text}

Burmese translation (natural, concise, no repetitions):"""

        try:
            translated = self._request_translation(prompt, config.TRANSLATION['temperature'])

            # Check for hallucination/loops; retry once at a lower temperature
            # and accept whatever the retry produces.
            if self.detect_repetition(translated):
                logger.warning("Detected repetitive text, retrying with lower temperature")
                translated = self._request_translation(prompt, 0.3)

            return translated

        except Exception as e:
            logger.error(f"API translation error: {e}")
            return text  # Fallback to original

    def _request_translation(self, prompt: str, temperature: float) -> str:
        """Send *prompt* to the model and return the cleaned-up reply text."""
        message = self.client.messages.create(
            model=config.TRANSLATION['model'],
            max_tokens=min(config.TRANSLATION['max_tokens'], 3000),  # Cap at 3000
            temperature=temperature,
            messages=[{"role": "user", "content": prompt}],
        )
        translated = message.content[0].text.strip()
        return self.post_process_translation(translated)

    def translate_long_text(self, text: str, chunk_size: int = 1200) -> str:
        """Translate long text chunk by chunk.

        Text shorter than ``chunk_size`` characters is translated in one
        call. Otherwise it is split on blank lines (and, for oversized
        paragraphs, on ``'. '``), each chunk is translated separately and
        the results are joined with blank lines.

        Args:
            text: English text to translate.
            chunk_size: Target upper bound, in characters, for each chunk.

        Returns:
            The joined translation. A chunk whose translation raises is
            passed through untranslated.
        """
        # If text is short enough, translate directly
        if len(text) < chunk_size:
            return self.translate_text(text, context="This is the main article content")

        logger.info(f"Article is {len(text)} chars, splitting into chunks...")

        chunks = self._split_into_chunks(text, chunk_size)
        if not chunks:
            # Whitespace-only input: nothing to translate, and the average
            # below would divide by zero.
            logger.warning("No translatable content found after chunking")
            return text

        logger.info(f"Split into {len(chunks)} chunks (avg {len(text)//len(chunks)} chars each)")

        # Translate each chunk with progress tracking
        translated_chunks = []
        failed_chunks = 0

        for i, chunk in enumerate(chunks):
            logger.info(f"Translating chunk {i+1}/{len(chunks)} ({len(chunk)} chars)...")

            try:
                translated = self.translate_text(
                    chunk,
                    context=f"This is part {i+1} of {len(chunks)} of a longer article",
                )

                # Validate chunk translation
                if self.detect_repetition(translated):
                    logger.warning(f"Chunk {i+1} has repetition, retrying...")
                    time.sleep(1)
                    translated = self.translate_text(
                        chunk,
                        context=f"This is part {i+1} of {len(chunks)} - translate fully without repetition",
                    )

                translated_chunks.append(translated)
                time.sleep(0.5)  # Rate limiting

            except Exception as e:
                # translate_text handles API errors itself, so this is a
                # last-resort guard for anything unexpected.
                logger.error(f"Failed to translate chunk {i+1}: {e}")
                failed_chunks += 1
                # Use original text as fallback for this chunk
                translated_chunks.append(chunk)
                time.sleep(1)

        if failed_chunks > 0:
            logger.warning(f"{failed_chunks}/{len(chunks)} chunks failed translation")

        # Join chunks
        result = '\n\n'.join(translated_chunks)
        logger.info(f"Translation complete: {len(result)} chars (original: {len(text)} chars)")

        return result

    def _split_into_chunks(self, text: str, chunk_size: int) -> List[str]:
        """Group paragraphs of *text* into chunks of roughly *chunk_size* chars.

        Paragraphs are separated by blank lines. A paragraph longer than
        ``chunk_size`` is further split on ``'. '``; a single sentence that
        is itself longer than ``chunk_size`` is kept whole. Empty chunks are
        dropped.
        """
        chunks: List[str] = []

        def flush(piece: str) -> None:
            # Store a finished chunk, skipping ones that are only whitespace.
            stripped = piece.strip()
            if stripped:
                chunks.append(stripped)

        current_chunk = ""
        for para in text.split('\n\n'):
            # Check if adding this paragraph would exceed chunk size
            if len(current_chunk) + len(para) + 4 < chunk_size:  # +4 for \n\n
                current_chunk = f"{current_chunk}\n\n{para}" if current_chunk else para
                continue

            # Current chunk is full, save it
            flush(current_chunk)

            if len(para) <= chunk_size:
                current_chunk = para
                continue

            # Paragraph itself is too long: split it by sentences.
            sentences = para.split('. ')
            last_index = len(sentences) - 1
            temp_chunk = ""
            for idx, sent in enumerate(sentences):
                # split() removed the '. ' separators; restore them on every
                # piece except the last, which keeps its own ending.
                piece = sent if idx == last_index else sent + '. '
                if len(temp_chunk) + len(sent) + 2 < chunk_size:
                    temp_chunk += piece
                else:
                    flush(temp_chunk)
                    temp_chunk = piece
            current_chunk = temp_chunk

        # Don't forget the last chunk
        flush(current_chunk)
        return chunks

    def detect_repetition(self, text: str, threshold: int = 5) -> bool:
        """Detect repetitive patterns (a sign of the model looping).

        Args:
            text: Text to inspect.
            threshold: Number of occurrences of the same 5-token sequence
                at which the text is considered repetitive.

        Returns:
            True if some sequence of five consecutive whitespace-separated
            tokens occurs at least ``threshold`` times. Text shorter than
            100 characters or 10 tokens is never flagged.
        """
        if len(text) < 100:
            return False

        words = text.split()
        if len(words) < 10:
            return False

        # Count every window of five consecutive tokens.
        sequences = Counter(' '.join(words[i:i + 5]) for i in range(len(words) - 4))

        max_repetitions = max(sequences.values(), default=0)
        if max_repetitions >= threshold:
            logger.warning(f"Detected repetition: {max_repetitions} occurrences")
            return True

        return False

    def validate_translation(self, translated: str, original: str) -> bool:
        """Run basic sanity checks on a translation.

        Args:
            translated: The candidate Burmese text.
            original: The English source text.

        Returns:
            True only if the translation is at least 50 characters long,
            contains Myanmar-block characters, is between 30% and 300% of
            the original's length, and shows no repetition.
        """
        # Check 1: Not empty
        if not translated or len(translated) < 50:
            logger.warning("Translation too short")
            return False

        # Check 2: Has Burmese Unicode
        if not self.validate_burmese_text(translated):
            logger.warning("Translation missing Burmese text")
            return False

        # Check 3: Reasonable length ratio (translated should be 30-300% of original)
        if not original:
            # A substantial "translation" of nothing is suspicious, and the
            # ratio below would divide by zero.
            logger.warning("Original text is empty, cannot validate length ratio")
            return False
        ratio = len(translated) / len(original)
        if ratio < 0.3 or ratio > 3.0:
            logger.warning(f"Translation length ratio suspicious: {ratio:.2f}")
            return False

        # Check 4: No repetition
        if self.detect_repetition(translated):
            logger.warning("Translation has repetitive patterns")
            return False

        return True

    def post_process_translation(self, text: str) -> str:
        """Clean up whitespace and punctuation spacing in a translation.

        Strips each line, collapses runs of three or more newlines to a
        single blank line, and inserts a space after the Burmese
        punctuation marks U+104A and U+104B when they are directly
        followed by a non-space character.
        """
        # Strip every line first so that whitespace-only lines become empty
        # and are included in the newline collapsing below.
        text = '\n'.join(line.strip() for line in text.split('\n'))

        # Remove excessive newlines
        text = re.sub(r'\n{3,}', '\n\n', text)

        # Ensure proper spacing after Burmese punctuation. Escapes are used
        # so the pattern survives any re-encoding of this source file.
        text = re.sub(r'([\u104A\u104B])(?=\S)', r'\1 ', text)

        return text.strip()

    def validate_burmese_text(self, text: str) -> bool:
        """Return True if *text* contains at least one Myanmar-block character."""
        # Myanmar Unicode range: U+1000 to U+109F
        burmese_pattern = re.compile(r'[\u1000-\u109F]')
        return bool(burmese_pattern.search(text))


def run_translator(compiled_articles: list) -> list:
    """Translate compiled articles to Burmese.

    Args:
        compiled_articles: Article dicts as accepted by
            ``BurmeseTranslator.translate_article``.

    Returns:
        One output article per input. Articles that fail keep their English
        text in the ``*_burmese`` fields. If the translator cannot be set
        up at all, ``compiled_articles`` is returned unchanged.
    """
    logger.info(f"Starting translator for {len(compiled_articles)} articles...")
    start_time = time.time()

    try:
        translator = BurmeseTranslator()
        translated_articles = []

        for i, article in enumerate(compiled_articles, 1):
            logger.info(f"Translating article {i}/{len(compiled_articles)}")

            try:
                translated_article = translator.translate_article(article)
                translated_articles.append(translated_article)
                logger.info(f"✓ Translation successful for article {i}")

            except Exception as e:
                logger.error(f"Failed to translate article {i}: {e}")
                # Add article with original English text as fallback
                translated_articles.append(_untranslated_copy(article))

        duration = int(time.time() - start_time)
        logger.info(f"Translator completed in {duration}s. Articles translated: {len(translated_articles)}")

        return translated_articles

    except Exception as e:
        logger.error(f"Translator failed: {e}")
        return compiled_articles  # Return originals as fallback


if __name__ == '__main__':
    # Test the translator
    test_article = {
        'title': 'Test Article About AI',
        'excerpt': 'This is a test excerpt about artificial intelligence.',
        'content': 'This is test content. ' * 100  # Long content
    }

    translator = BurmeseTranslator()
    result = translator.translate_article(test_article)

    print("Title:", result['title_burmese'])
    print("Excerpt:", result['excerpt_burmese'])
    print("Content length:", len(result['content_burmese']))