# burmddit/backend/translator.py

# Burmese translation module using Claude

from typing import Dict, Optional
from loguru import logger
import anthropic
import re
import config
import time

class BurmeseTranslator:
    """Translate English article text into casual Burmese via the Anthropic Messages API.

    The API key, the model parameters and the list of terms that must stay in
    English are all read from the ``config`` module.
    """

    # Three or more consecutive newlines, i.e. more than one blank line in a row.
    _EXCESS_NEWLINES = re.compile(r'\n{3,}')
    # Burmese full stop / comma immediately followed by a non-whitespace character.
    _PUNCT_WITHOUT_SPACE = re.compile(r'([။၊])([^\s])')
    # Any code point in the Myanmar Unicode block (U+1000 to U+109F).
    _BURMESE_CHAR = re.compile(r'[\u1000-\u109F]')

    def __init__(self):
        """Create the API client and load the preserved-terms list from ``config``."""
        self.client = anthropic.Anthropic(api_key=config.ANTHROPIC_API_KEY)
        self.preserve_terms = config.TRANSLATION['preserve_terms']

    def translate_article(self, article: Dict) -> Dict:
        """Translate the title, excerpt and content of a compiled article.

        Args:
            article: Mapping that provides ``'title'``, ``'excerpt'`` and
                ``'content'``.

        Returns:
            A copy of ``article`` extended with ``'title_burmese'``,
            ``'excerpt_burmese'`` and ``'content_burmese'``. If anything goes
            wrong while translating, those keys carry the original text (or an
            empty string for a field the article does not have).

        Raises:
            KeyError: If ``article`` has no ``'title'``; the title is read for
                the log line before the guarded section starts.
        """
        logger.info(f"Translating article: {article['title'][:50]}...")

        try:
            # Translate title
            title_burmese = self.translate_text(
                text=article['title'],
                context="This is an article title about AI technology"
            )

            # Translate excerpt
            excerpt_burmese = self.translate_text(
                text=article['excerpt'],
                context="This is a brief article summary"
            )

            # Translate main content (in chunks if too long)
            content_burmese = self.translate_long_text(article['content'])

            # Return article with Burmese translations
            return {
                **article,
                'title_burmese': title_burmese,
                'excerpt_burmese': excerpt_burmese,
                'content_burmese': content_burmese
            }

        except Exception as e:
            logger.error(f"Translation error: {e}")
            # Fallback: return original text if translation fails. Use .get()
            # so that a missing field (a likely cause of the failure) cannot
            # raise a second time from inside this handler.
            return {
                **article,
                'title_burmese': article.get('title', ''),
                'excerpt_burmese': article.get('excerpt', ''),
                'content_burmese': article.get('content', '')
            }

    def translate_text(self, text: str, context: str = "") -> str:
        """Translate one block of text to Burmese.

        Args:
            text: English text to translate.
            context: Short hint inserted into the prompt describing what the
                text is (title, summary, part of a longer article, ...).

        Returns:
            The post-processed translation. ``text`` is returned unchanged when
            it is empty or whitespace-only, or when the API call fails.
        """
        # Nothing to translate: skip the API round trip entirely.
        if not text or not text.strip():
            return text

        # Build preserved terms list for this text
        preserved_terms_str = ", ".join(self.preserve_terms)

        prompt = f"""Translate the following English text to Burmese (Myanmar Unicode) in a CASUAL, EASY-TO-READ style.

🎯 CRITICAL GUIDELINES:
1. Write in **CASUAL, CONVERSATIONAL Burmese** - like talking to a friend over tea
2. Use **SIMPLE, EVERYDAY words** - avoid formal or academic language
3. Explain technical concepts in **LAYMAN TERMS** - as if explaining to your grandmother
4. Keep these terms in English: {preserved_terms_str}
5. Add **brief explanations** in parentheses for complex terms
6. Use **short sentences** - easy to read on mobile
7. Break up long paragraphs - white space is good
8. Keep markdown formatting (##, **, -, etc.) intact

TARGET AUDIENCE: General Myanmar public who are curious about AI but not tech experts

TONE: Friendly, approachable, informative but not boring

EXAMPLE STYLE:
❌ Bad (too formal): "ယခု နည်းပညာသည် ဉာဏ်ရည်တု ဖြစ်စဉ်များကို အသုံးပြုပါသည်"
✅ Good (casual): "ဒီနည်းပညာက AI (အထက်တန်းကွန်ပျူတာဦးနှောက်) ကို သုံးတာပါ"

Context: {context}

Text to translate:
{text}

Casual, easy-to-read Burmese translation:"""

        try:
            message = self.client.messages.create(
                model=config.TRANSLATION['model'],
                max_tokens=config.TRANSLATION['max_tokens'],
                temperature=config.TRANSLATION['temperature'],
                messages=[{"role": "user", "content": prompt}]
            )

            translated = message.content[0].text.strip()

            # Post-process: normalise spacing and restore preserved terms
            return self.post_process_translation(translated)

        except Exception as e:
            logger.error(f"API translation error: {e}")
            return text  # Fallback to original

    def translate_long_text(self, text: str, chunk_size: int = 2000) -> str:
        """Translate long text in paragraph-aligned chunks.

        Args:
            text: Text to translate; paragraphs are separated by blank lines.
            chunk_size: Character budget per chunk. A single paragraph longer
                than this is still sent as one chunk.

        Returns:
            The translated chunks joined with blank lines, or the direct
            translation when ``text`` is shorter than ``chunk_size``.
        """
        # If text is short enough, translate directly
        if len(text) < chunk_size:
            return self.translate_text(text, context="This is the main article content")

        chunks = self._split_into_chunks(text, chunk_size)

        logger.info(f"Translating {len(chunks)} chunks...")

        # Translate each chunk
        translated_chunks = []
        last_index = len(chunks) - 1
        for i, chunk in enumerate(chunks):
            logger.debug(f"Translating chunk {i+1}/{len(chunks)}")
            translated_chunks.append(
                self.translate_text(
                    chunk,
                    context=f"This is part {i+1} of {len(chunks)} of a longer article"
                )
            )
            # Rate limiting between requests; pointless after the final one.
            if i < last_index:
                time.sleep(0.5)

        # Join chunks
        return '\n\n'.join(translated_chunks)

    @staticmethod
    def _split_into_chunks(text: str, chunk_size: int) -> list:
        """Group blank-line-separated paragraphs into chunks of about ``chunk_size`` characters.

        Paragraphs are never split. Chunks that contain only whitespace are
        dropped so that no empty translation request is made.
        """
        chunks = []
        current = []
        current_len = 0  # characters accumulated, counting a 2-char separator per paragraph

        for para in text.split('\n\n'):
            # Close the running chunk once adding this paragraph would reach the budget.
            if current and current_len + len(para) >= chunk_size:
                chunks.append('\n\n'.join(current))
                current, current_len = [], 0
            current.append(para)
            current_len += len(para) + 2

        if current:
            chunks.append('\n\n'.join(current))

        stripped = (chunk.strip() for chunk in chunks)
        return [chunk for chunk in stripped if chunk]

    def post_process_translation(self, text: str) -> str:
        """Clean up a raw translation.

        Collapses runs of blank lines, inserts a space after Burmese
        punctuation that is directly followed by text, and restores the
        canonical spelling of preserved terms that only occur in a different
        letter case.

        Args:
            text: Raw translation.

        Returns:
            The cleaned text with surrounding whitespace removed.
        """
        # Collapse three or more newlines into a single blank line
        text = self._EXCESS_NEWLINES.sub('\n\n', text)

        # Ensure proper spacing after punctuation
        text = self._PUNCT_WITHOUT_SPACE.sub(r'\1 \2', text)

        # Restore preserved terms that came back in the wrong letter case
        for term in self.preserve_terms:
            if not term or term in text:
                continue
            # Match the term only when it is not embedded in a longer ASCII
            # word, so restoring "AI" cannot rewrite "said" into "sAId". ASCII
            # lookarounds are used instead of \b because Burmese letters count
            # as word characters and would hide a term written right next to
            # Burmese script.
            pattern = re.compile(
                r'(?<![A-Za-z0-9])' + re.escape(term) + r'(?![A-Za-z0-9])',
                re.IGNORECASE,
            )
            # A function replacement inserts the term literally, so backslashes
            # in a term are not treated as template escapes.
            text = pattern.sub(lambda _match, _term=term: _term, text)

        return text.strip()

    def validate_burmese_text(self, text: str) -> bool:
        """Return True if ``text`` contains at least one Myanmar-block character."""
        return bool(self._BURMESE_CHAR.search(text))

def _record_translate_stage(**fields) -> None:
    """Record the outcome of the translate stage; never raises.

    Stage bookkeeping is best-effort: a failure to import or call
    ``database.log_pipeline_stage`` is logged and otherwise ignored so that it
    cannot discard articles that were already translated.
    """
    try:
        from database import log_pipeline_stage
        log_pipeline_stage(stage='translate', **fields)
    except Exception as e:
        logger.error(f"Could not record translate stage: {e}")


def run_translator(compiled_articles: list) -> list:
    """Translate compiled articles to Burmese.

    Args:
        compiled_articles: Article dicts as accepted by
            ``BurmeseTranslator.translate_article``.

    Returns:
        The translated articles. An article whose translation raises is logged
        and skipped. An article whose Burmese content fails validation is kept;
        only a warning is logged. An empty list is returned if the run as a
        whole fails (for example the translator cannot be constructed).
    """
    logger.info(f"Starting translator for {len(compiled_articles)} articles...")
    start_time = time.time()

    try:
        translator = BurmeseTranslator()
        translated_articles = []
        total = len(compiled_articles)

        for i, article in enumerate(compiled_articles, 1):
            logger.info(f"Translating article {i}/{len(compiled_articles)}")

            try:
                translated = translator.translate_article(article)

                # Validate translation. The article is kept either way; the
                # check only decides which log line is written.
                if translator.validate_burmese_text(translated['content_burmese']):
                    logger.info(f"✓ Translation successful for article {i}")
                else:
                    logger.warning(f"✗ Translation validation failed for article {i}")
                translated_articles.append(translated)

                # Rate limiting between articles; not needed after the last one.
                if i < total:
                    time.sleep(1)

            except Exception as e:
                logger.error(f"Error translating article {i}: {e}")
                continue

        duration = int(time.time() - start_time)

        _record_translate_stage(
            status='completed',
            articles_processed=len(translated_articles),
            duration=duration
        )

        logger.info(f"Translator completed in {duration}s. Articles translated: {len(translated_articles)}")
        return translated_articles

    except Exception as e:
        logger.error(f"Translator failed: {e}")
        _record_translate_stage(
            status='failed',
            error_message=str(e)
        )
        return []

if __name__ == '__main__':
    # Manual smoke test: translate one hard-coded sample article and print the
    # start of the original and of the Burmese result. This calls the real API.
    from loguru import logger
    logger.add(config.LOG_FILE, rotation="1 day")

    # Sample article body (markdown) used for the test translation
    sample_content = '''OpenAI has officially released GPT-5, marking a significant milestone in artificial intelligence development.

## Key Features

The new model includes:
- 10x more parameters than GPT-4
- Better reasoning capabilities
- Multimodal support for video
- Reduced hallucinations

CEO Sam Altman said, "GPT-5 represents our most advanced AI system yet."

The model will be available to ChatGPT Plus subscribers starting next month.'''

    sample_article = {
        'title': 'OpenAI Releases GPT-5: A New Era of AI',
        'excerpt': 'OpenAI today announced GPT-5, the next generation of their language model.',
        'content': sample_content,
    }

    result = BurmeseTranslator().translate_article(sample_article)

    # Original text first, then its translation; content is cut to 200 chars
    original_preview = result['content'][:200]
    burmese_preview = result['content_burmese'][:200]

    print("\n=== ORIGINAL ===")
    print(f"Title: {result['title']}")
    print(f"\nContent: {original_preview}...")

    print("\n=== BURMESE ===")
    print(f"Title: {result['title_burmese']}")
    print(f"\nContent: {burmese_preview}...")