# burmddit/backend/translator_v2.py

# Improved Burmese translation module with better error handling

from typing import Dict, Optional
from loguru import logger
import anthropic
import re
import config
import time

class BurmeseTranslator:
    """Translate English articles into casual Burmese through the Anthropic API.

    The translator sends one prompt per text block, cleans up the response,
    and retries when the output looks repetitive. Long article bodies are
    split into chunks that are translated one at a time and re-joined with
    blank lines.
    """

    # Any character in the Myanmar Unicode block (U+1000 to U+109F).
    _BURMESE_CHAR_RE = re.compile(r'[\u1000-\u109F]')
    # Three or more consecutive newlines, i.e. more than one blank line.
    _EXCESS_NEWLINES_RE = re.compile(r'(\n{3,})')
    # Burmese punctuation immediately followed by a non-whitespace character.
    _BURMESE_PUNCT_RE = re.compile(r'([။၊])([^\s])')
    # Upper bound for the sampling temperature used on a repetition retry.
    _RETRY_TEMPERATURE = 0.3

    def __init__(self):
        """Create the API client and load the terms that stay in English."""
        self.client = anthropic.Anthropic(api_key=config.ANTHROPIC_API_KEY)
        self.preserve_terms = config.TRANSLATION['preserve_terms']

    def translate_article(self, article: Dict) -> Dict:
        """Translate a compiled article's title, excerpt and content to Burmese.

        Args:
            article: Mapping that is read for the keys ``title``, ``excerpt``
                and ``content``. Missing or ``None`` values are treated as
                empty strings so the fallback path can never fail on a lookup.

        Returns:
            A copy of ``article`` with ``title_burmese``, ``excerpt_burmese``
            and ``content_burmese`` added. If anything raises, those three
            keys carry the original (untranslated) text instead.
        """
        title = article.get('title') or ''
        excerpt = article.get('excerpt') or ''
        content = article.get('content') or ''

        try:
            logger.info(f"Translating article: {title[:50]}...")

            title_burmese = self.translate_text(
                text=title,
                context="This is an article title about AI technology",
                max_length=200
            )

            excerpt_burmese = self.translate_text(
                text=excerpt,
                context="This is a brief article summary",
                max_length=300
            )

            content_burmese = self.translate_long_text(
                content,
                chunk_size=1200  # Reduced from 2000 for safety
            )

            # Empty content has nothing to validate or retry.
            if content and not self.validate_translation(content_burmese, content):
                logger.warning("Translation validation failed, using fallback")
                # Second attempt with smaller chunks.
                content_burmese = self.translate_long_text(
                    content,
                    chunk_size=800
                )

            return {
                **article,
                'title_burmese': title_burmese,
                'excerpt_burmese': excerpt_burmese,
                'content_burmese': content_burmese
            }

        except Exception as e:
            logger.error(f"Translation error: {e}")
            # Fallback: return original text if translation fails
            return {
                **article,
                'title_burmese': title,
                'excerpt_burmese': excerpt,
                'content_burmese': content
            }

    def translate_text(self, text: str, context: str = "", max_length: Optional[int] = None) -> str:
        """Translate one block of text to Burmese.

        Args:
            text: English text to translate.
            context: Short description of the text, inserted into the prompt.
            max_length: Optional limit that is quoted in the prompt as a
                length instruction; it is not enforced on the response.

        Returns:
            The cleaned-up translation. Empty or whitespace-only input is
            returned as-is without calling the API, and any exception during
            the request makes the method return ``text`` unchanged.
        """
        # Nothing to translate: skip the API call entirely.
        if not text or not text.strip():
            return text or ""

        # Build preserved terms list
        preserved_terms_str = ", ".join(self.preserve_terms)

        # Add length guidance if specified
        length_guidance = ""
        if max_length:
            length_guidance = f"\n⚠️ IMPORTANT: Keep translation under {max_length} words. Be concise."

        prompt = f"""Translate the following English text to Burmese (Myanmar Unicode) in a CASUAL, EASY-TO-READ style.

🎯 CRITICAL GUIDELINES:
1. Write in **CASUAL, CONVERSATIONAL Burmese** - like talking to a friend
2. Use **SIMPLE, EVERYDAY words** - avoid formal or academic language
3. Explain technical concepts in **LAYMAN TERMS**
4. Keep these terms in English: {preserved_terms_str}
5. Add **brief explanations** in parentheses for complex terms
6. Use **short sentences** - easy to read on mobile
7. Break up long paragraphs - white space is good
8. Keep markdown formatting (##, **, -, etc.) intact{length_guidance}

🚫 CRITICAL: DO NOT REPEAT TEXT OR GET STUCK IN LOOPS!
- If you start repeating, STOP immediately
- Translate fully but concisely
- Each sentence should be unique

TARGET AUDIENCE: General Myanmar public curious about AI

Context: {context}

Text to translate:
{text}

Burmese translation (natural, concise, no repetitions):"""

        try:
            base_temperature = config.TRANSLATION['temperature']
            translated = self._request_translation(prompt, base_temperature)

            # Check for hallucination/loops
            if self.detect_repetition(translated):
                logger.warning("Detected repetitive text, retrying with lower temperature")
                # Never retry hotter than the configured temperature.
                retry_temperature = min(self._RETRY_TEMPERATURE, base_temperature)
                translated = self._request_translation(prompt, retry_temperature)

            return translated

        except Exception as e:
            logger.error(f"API translation error: {e}")
            return text  # Fallback to original

    def _request_translation(self, prompt: str, temperature: float) -> str:
        """Send ``prompt`` to the model once and return the post-processed reply."""
        message = self.client.messages.create(
            model=config.TRANSLATION['model'],
            max_tokens=min(config.TRANSLATION['max_tokens'], 3000),  # Cap at 3000
            temperature=temperature,
            messages=[{"role": "user", "content": prompt}]
        )
        translated = message.content[0].text.strip()
        return self.post_process_translation(translated)

    def translate_long_text(self, text: str, chunk_size: int = 1200) -> str:
        """Translate long text chunk by chunk.

        Args:
            text: Full text to translate.
            chunk_size: Maximum number of characters per chunk. Text shorter
                than this is translated in a single request.

        Returns:
            The translated chunks joined with blank lines. A chunk that
            raises during translation is replaced by its original text.
        """
        # If text is short enough, translate directly
        if len(text) < chunk_size:
            return self.translate_text(text, context="This is the main article content")

        logger.info(f"Article is {len(text)} chars, splitting into chunks...")

        chunks = self._split_into_chunks(text, chunk_size)
        if not chunks:
            # Whitespace-only input produces no chunks; avoid dividing by zero below.
            return self.translate_text(text, context="This is the main article content")

        total = len(chunks)
        logger.info(f"Split into {total} chunks (avg {len(text)//total} chars each)")

        translated_chunks = []
        failed_chunks = 0

        for i, chunk in enumerate(chunks, 1):
            logger.info(f"Translating chunk {i}/{total} ({len(chunk)} chars)...")

            try:
                translated = self.translate_text(
                    chunk,
                    context=f"This is part {i} of {total} of a longer article"
                )

                # Validate chunk translation
                if self.detect_repetition(translated):
                    logger.warning(f"Chunk {i} has repetition, retrying...")
                    time.sleep(1)
                    translated = self.translate_text(
                        chunk,
                        context=f"This is part {i} of {total} - translate fully without repetition"
                    )

                # translate_text returns the source text instead of raising on
                # API errors, so a result without Burmese characters is the
                # only visible sign that the chunk was not translated.
                if not self.validate_burmese_text(translated):
                    logger.error(f"Chunk {i} came back without Burmese text")
                    failed_chunks += 1

                translated_chunks.append(translated)
                time.sleep(0.5)  # Rate limiting

            except Exception as e:
                logger.error(f"Failed to translate chunk {i}: {e}")
                failed_chunks += 1
                # Use original text as fallback for this chunk
                translated_chunks.append(chunk)
                time.sleep(1)

        if failed_chunks > 0:
            logger.warning(f"{failed_chunks}/{total} chunks failed translation")

        result = '\n\n'.join(translated_chunks)
        logger.info(f"Translation complete: {len(result)} chars (original: {len(text)} chars)")

        return result

    @staticmethod
    def _split_into_chunks(text: str, chunk_size: int) -> list:
        """Group blank-line-separated paragraphs into chunks of at most ``chunk_size`` chars.

        Paragraphs longer than ``chunk_size`` are broken up further on
        sentence boundaries. Empty paragraphs are dropped, so whitespace-only
        text yields an empty list.
        """
        chunk_size = max(1, chunk_size)
        chunks = []
        current = ""

        for para in text.split('\n\n'):
            para = para.strip()
            if not para:
                continue

            if len(para) > chunk_size:
                # Close the running chunk, then break the paragraph itself up.
                if current:
                    chunks.append(current)
                    current = ""
                pieces = BurmeseTranslator._split_paragraph(para, chunk_size)
                if pieces:
                    chunks.extend(pieces[:-1])
                    # The tail may still have room for the next paragraph.
                    current = pieces[-1]
                continue

            candidate = f"{current}\n\n{para}" if current else para
            if len(candidate) <= chunk_size:
                current = candidate
            else:
                chunks.append(current)
                current = para

        # Don't forget the last chunk
        if current:
            chunks.append(current)

        return chunks

    @staticmethod
    def _split_paragraph(paragraph: str, chunk_size: int) -> list:
        """Split one oversized paragraph into pieces of at most ``chunk_size`` chars.

        Sentences are delimited by ``'. '``. A sentence that is itself too
        long is cut at the last space or newline that fits, or at exactly
        ``chunk_size`` characters when it contains no whitespace.
        """
        parts = paragraph.split('. ')
        # Restore the delimiter on every sentence except the last, which keeps
        # whatever ending it already had (no doubled period).
        sentences = [part + '. ' for part in parts[:-1]] + [parts[-1]]

        pieces = []
        current = ""

        for sentence in sentences:
            if not sentence:
                continue

            if len(current) + len(sentence) <= chunk_size:
                current += sentence
                continue

            if current.strip():
                pieces.append(current.strip())

            while len(sentence) > chunk_size:
                cut = max(
                    sentence.rfind(' ', 0, chunk_size + 1),
                    sentence.rfind('\n', 0, chunk_size + 1),
                )
                if cut <= 0:
                    cut = chunk_size  # No whitespace to break on: hard cut.
                head = sentence[:cut].strip()
                if head:
                    pieces.append(head)
                sentence = sentence[cut:].lstrip()

            current = sentence

        if current.strip():
            pieces.append(current.strip())

        return pieces

    def detect_repetition(self, text: str, threshold: int = 5) -> bool:
        """Detect repeated phrases that suggest the model got stuck in a loop.

        Args:
            text: Text to inspect.
            threshold: Number of occurrences of the same 5-word sequence at
                which the text is considered repetitive.

        Returns:
            ``True`` when some 5-word sequence occurs at least ``threshold``
            times. Text under 100 characters or under 10 whitespace-separated
            words is never flagged.
        """
        if len(text) < 100:
            return False

        # NOTE(review): words are whitespace-delimited, so text written with
        # few spaces yields few words and may skip this check - confirm this
        # is acceptable for Burmese output.
        words = text.split()
        if len(words) < 10:
            return False

        # Count every sliding window of five consecutive words.
        sequences = {}
        for i in range(len(words) - 4):
            seq = ' '.join(words[i:i+5])
            sequences[seq] = sequences.get(seq, 0) + 1

        max_repetitions = max(sequences.values()) if sequences else 0

        if max_repetitions >= threshold:
            logger.warning(f"Detected repetition: {max_repetitions} occurrences")
            return True

        return False

    def validate_translation(self, translated: str, original: str) -> bool:
        """Run basic sanity checks on a translation.

        Args:
            translated: Translated text.
            original: Source text it was produced from.

        Returns:
            ``True`` only if the translation has at least 50 characters,
            contains Burmese characters, is between 0.3x and 3.0x the length
            of a non-empty original, and shows no repetition.
        """
        # Check 1: Not empty
        if not translated or len(translated) < 50:
            logger.warning("Translation too short")
            return False

        # Check 2: Has Burmese Unicode
        if not self.validate_burmese_text(translated):
            logger.warning("Translation missing Burmese text")
            return False

        # Check 3: Reasonable length ratio (translated should be 30-300% of original)
        if not original:
            # A sizeable translation of empty input cannot be right, and the
            # ratio below would divide by zero.
            logger.warning("Translation produced for empty original")
            return False

        ratio = len(translated) / len(original)
        if ratio < 0.3 or ratio > 3.0:
            logger.warning(f"Translation length ratio suspicious: {ratio:.2f}")
            return False

        # Check 4: No repetition
        if self.detect_repetition(translated):
            logger.warning("Translation has repetitive patterns")
            return False

        return True

    def post_process_translation(self, text: str) -> str:
        """Normalize whitespace and punctuation spacing in a translation.

        Strips every line, collapses runs of blank lines to a single blank
        line, inserts a space after Burmese punctuation that is directly
        followed by text, and trims the result.
        """
        # Strip lines first so whitespace-only lines become truly empty and
        # are caught by the blank-line collapse below.
        lines = [line.strip() for line in text.split('\n')]
        text = '\n'.join(lines)

        # Remove excessive newlines
        text = self._EXCESS_NEWLINES_RE.sub('\n\n', text)

        # Ensure proper spacing after Burmese punctuation
        text = self._BURMESE_PUNCT_RE.sub(r'\1 \2', text)

        return text.strip()

    def validate_burmese_text(self, text: str) -> bool:
        """Return whether ``text`` contains at least one Myanmar-block character."""
        # Myanmar Unicode range: U+1000 to U+109F
        return bool(self._BURMESE_CHAR_RE.search(text))

def run_translator(compiled_articles: list) -> list:
    """Translate a batch of compiled articles to Burmese.

    Args:
        compiled_articles: Articles to translate; each is passed to
            ``BurmeseTranslator.translate_article``.

    Returns:
        One entry per input article, each carrying ``title_burmese``,
        ``excerpt_burmese`` and ``content_burmese``. An article whose
        translation raises gets its original text (or an empty string for a
        missing field) in those keys. If the run fails outside the
        per-article handling, ``compiled_articles`` is returned unchanged.
    """
    logger.info(f"Starting translator for {len(compiled_articles)} articles...")
    start_time = time.time()

    try:
        translator = BurmeseTranslator()
        translated_articles = []

        for i, article in enumerate(compiled_articles, 1):
            logger.info(f"Translating article {i}/{len(compiled_articles)}")

            try:
                translated_article = translator.translate_article(article)
                translated_articles.append(translated_article)
                logger.info(f"✓ Translation successful for article {i}")

            except Exception as e:
                logger.error(f"Failed to translate article {i}: {e}")
                # Add article with original English text as fallback.
                # Use .get() so a missing field - a likely cause of the
                # failure itself - cannot raise again here and abort the
                # whole batch through the outer handler.
                translated_articles.append({
                    **article,
                    'title_burmese': article.get('title', ''),
                    'excerpt_burmese': article.get('excerpt', ''),
                    'content_burmese': article.get('content', '')
                })

        duration = int(time.time() - start_time)
        logger.info(f"Translator completed in {duration}s. Articles translated: {len(translated_articles)}")

        return translated_articles

    except Exception as e:
        logger.error(f"Translator failed: {e}")
        return compiled_articles  # Return originals as fallback

if __name__ == '__main__':
    # Manual smoke test: push one synthetic article through the translator
    # and print what came back. The content is repeated so that it is long
    # enough to go through the chunked translation path.
    sample_article = dict(
        title='Test Article About AI',
        excerpt='This is a test excerpt about artificial intelligence.',
        content='This is test content. ' * 100,
    )

    outcome = BurmeseTranslator().translate_article(sample_article)

    print("Title:", outcome['title_burmese'])
    print("Excerpt:", outcome['excerpt_burmese'])
    print("Content length:", len(outcome['content_burmese']))