Initial Burmddit deployment - AI news aggregator in Burmese

2026-02-19 02:52:58 +00:00
commit dddb86ea94
27 changed files with 5039 additions and 0 deletions
--- a/backend/translator.py
+++ b/backend/translator.py
@@ -0,0 +1,255 @@
+# Burmese translation module using Claude
+
+from typing import Dict, Optional
+from loguru import logger
+import anthropic
+import re
+import config
+import time
+
+class BurmeseTranslator:
+    """Translate compiled English articles into casual Burmese via Claude.
+
+    Wraps an ``anthropic.Anthropic`` client. Model, token limit, temperature
+    and the list of terms to keep in English all come from ``config``.
+    """
+
+    def __init__(self):
+        self.client = anthropic.Anthropic(api_key=config.ANTHROPIC_API_KEY)
+        self.preserve_terms = config.TRANSLATION['preserve_terms']
+
+    def translate_article(self, article: Dict) -> Dict:
+        """Return a copy of ``article`` with ``*_burmese`` fields added.
+
+        Reads ``title``, ``excerpt`` and ``content`` (a missing key raises
+        ``KeyError``). If translating raises, the English text is used instead.
+        """
+        logger.info(f"Translating article: {article['title'][:50]}...")
+        try:
+            title_burmese = self.translate_text(
+                text=article['title'],
+                context="This is an article title about AI technology"
+            )
+            excerpt_burmese = self.translate_text(
+                text=article['excerpt'],
+                context="This is a brief article summary"
+            )
+            # The body may be long, so it goes through the chunking path
+            content_burmese = self.translate_long_text(article['content'])
+            return {
+                **article,
+                'title_burmese': title_burmese,
+                'excerpt_burmese': excerpt_burmese,
+                'content_burmese': content_burmese
+            }
+        except Exception as e:
+            logger.error(f"Translation error: {e}")
+            # Fallback: return original text if translation fails
+            return {
+                **article,
+                'title_burmese': article['title'],
+                'excerpt_burmese': article['excerpt'],
+                'content_burmese': article['content']
+            }
+
+    def translate_text(self, text: str, context: str = "") -> str:
+        """Translate one block of English text to Burmese.
+
+        Blank input is returned unchanged without calling the API, and the
+        original ``text`` is returned if the API call fails.
+        """
+        if not text or not text.strip():
+            return text  # nothing to translate; skip a wasted API call
+        preserved_terms_str = ", ".join(self.preserve_terms)
+
+        prompt = f"""Translate the following English text to Burmese (Myanmar Unicode) in a CASUAL, EASY-TO-READ style.
+
+🎯 CRITICAL GUIDELINES:
+1. Write in **CASUAL, CONVERSATIONAL Burmese** - like talking to a friend over tea
+2. Use **SIMPLE, EVERYDAY words** - avoid formal or academic language
+3. Explain technical concepts in **LAYMAN TERMS** - as if explaining to your grandmother
+4. Keep these terms in English: {preserved_terms_str}
+5. Add **brief explanations** in parentheses for complex terms
+6. Use **short sentences** - easy to read on mobile
+7. Break up long paragraphs - white space is good
+8. Keep markdown formatting (##, **, -, etc.) intact
+
+TARGET AUDIENCE: General Myanmar public who are curious about AI but not tech experts
+
+TONE: Friendly, approachable, informative but not boring
+
+EXAMPLE STYLE:
+❌ Bad (too formal): "ယခု နည်းပညာသည် ဉာဏ်ရည်တု ဖြစ်စဉ်များကို အသုံးပြုပါသည်"
+✅ Good (casual): "ဒီနည်းပညာက AI (အထက်တန်းကွန်ပျူတာဦးနှောက်) ကို သုံးတာပါ"
+
+Context: {context}
+
+Text to translate:
+{text}
+
+Casual, easy-to-read Burmese translation:"""
+
+        try:
+            message = self.client.messages.create(
+                model=config.TRANSLATION['model'],
+                max_tokens=config.TRANSLATION['max_tokens'],
+                temperature=config.TRANSLATION['temperature'],
+                messages=[{"role": "user", "content": prompt}]
+            )
+            translated = message.content[0].text.strip()
+            return self.post_process_translation(translated)
+        except Exception as e:
+            logger.error(f"API translation error: {e}")
+            return text  # Fallback to original
+
+    def translate_long_text(self, text: str, chunk_size: int = 2000) -> str:
+        """Translate long text in chunks to stay within token limits.
+
+        Paragraphs (split on blank lines) are never cut, so one that is longer
+        than ``chunk_size`` characters is sent as its own oversized chunk.
+        """
+        if len(text) < chunk_size:
+            return self.translate_text(text, context="This is the main article content")
+
+        # Greedily pack whole paragraphs into chunks of roughly chunk_size
+        chunks = []
+        current_chunk = ""
+        for para in text.split('\n\n'):
+            if current_chunk and len(current_chunk) + len(para) >= chunk_size:
+                chunks.append(current_chunk.strip())
+                current_chunk = ""
+            current_chunk += para + '\n\n'
+        chunks.append(current_chunk.strip())
+        chunks = [c for c in chunks if c]  # drop whitespace-only chunks
+
+        logger.info(f"Translating {len(chunks)} chunks...")
+        translated_chunks = []
+        for i, chunk in enumerate(chunks):
+            logger.debug(f"Translating chunk {i+1}/{len(chunks)}")
+            translated_chunks.append(self.translate_text(
+                chunk,
+                context=f"This is part {i+1} of {len(chunks)} of a longer article"
+            ))
+            if i + 1 < len(chunks):
+                time.sleep(0.5)  # Rate limiting; no wait needed after the last chunk
+        return '\n\n'.join(translated_chunks)
+
+    def post_process_translation(self, text: str) -> str:
+        """Clean up model output and restore the casing of preserved terms.
+
+        A preserved term is only re-cased where it stands alone, so a short
+        term such as "AI" does not rewrite the "ai" inside "said".
+        """
+        # Collapse runs of three or more newlines into one blank line
+        text = re.sub(r'\n{3,}', '\n\n', text)
+
+        # Ensure proper spacing after punctuation
+        text = re.sub(r'([။၊])([^\s])', r'\1 \2', text)
+
+        for term in self.preserve_terms:
+            if term in text:
+                continue  # exact form already present; leave the text alone
+            # ASCII lookarounds stand in for \b, which does not fire next to
+            # Burmese letters because they count as word characters too.
+            pattern = r'(?<![A-Za-z0-9])' + re.escape(term) + r'(?![A-Za-z0-9])'
+            # A callable replacement keeps any backslash in the term literal
+            text = re.sub(pattern, lambda _m, _t=term: _t, text, flags=re.IGNORECASE)
+
+        return text.strip()
+
+    def validate_burmese_text(self, text: str) -> bool:
+        """Check if text contains valid Burmese Unicode.
+
+        Only the presence of one Myanmar-block character is tested, so text
+        that is mostly untranslated English still passes.
+        """
+        # Myanmar Unicode range: U+1000 to U+109F
+        burmese_pattern = re.compile(r'[\u1000-\u109F]')
+        return bool(burmese_pattern.search(text))
+
+def run_translator(compiled_articles: list) -> list:
+    """Translate compiled articles to Burmese and record the pipeline stage.
+
+    Articles whose translation raises are skipped. A failure while recording
+    the stage in the database is logged but never discards finished work.
+
+    Returns:
+        The translated articles, or ``[]`` if the translator could not run.
+    """
+    logger.info(f"Starting translator for {len(compiled_articles)} articles...")
+    start_time = time.time()
+    translated_articles = []
+    failure = None
+
+    # Per-article errors are handled inside the loop; the outer handler only
+    # sees failures of the run itself, such as constructing the translator.
+    try:
+        translator = BurmeseTranslator()
+        for i, article in enumerate(compiled_articles, 1):
+            logger.info(f"Translating article {i}/{len(compiled_articles)}")
+            try:
+                translated = translator.translate_article(article)
+                if translator.validate_burmese_text(translated['content_burmese']):
+                    logger.info(f"✓ Translation successful for article {i}")
+                else:
+                    logger.warning(f"✗ Translation validation failed for article {i}")
+                # Articles that fail validation are still kept; only the log differs
+                translated_articles.append(translated)
+                time.sleep(1)  # Rate limiting
+            except Exception as e:
+                logger.error(f"Error translating article {i}: {e}")
+    except Exception as e:
+        logger.error(f"Translator failed: {e}")
+        failure = str(e)
+        translated_articles = []
+
+    duration = int(time.time() - start_time)
+    if failure is None:
+        stage_fields = {'status': 'completed', 'duration': duration,
+                        'articles_processed': len(translated_articles)}
+    else:
+        stage_fields = {'status': 'failed', 'error_message': failure}
+    try:
+        # Local import keeps `database` out of this module's import time
+        from database import log_pipeline_stage
+        log_pipeline_stage(stage='translate', **stage_fields)
+    except Exception as e:
+        logger.error(f"Could not record translate stage: {e}")
+
+    if failure is None:
+        logger.info(f"Translator completed in {duration}s. Articles translated: {len(translated_articles)}")
+    return translated_articles
+
+if __name__ == '__main__':
+    # Manual smoke test: translate one sample article and print a preview
+    logger.add(config.LOG_FILE, rotation="1 day")
+
+    sample_content = '''OpenAI has officially released GPT-5, marking a significant milestone in artificial intelligence development.
+
+## Key Features
+
+The new model includes:
+- 10x more parameters than GPT-4
+- Better reasoning capabilities
+- Multimodal support for video
+- Reduced hallucinations
+
+CEO Sam Altman said, "GPT-5 represents our most advanced AI system yet."
+
+The model will be available to ChatGPT Plus subscribers starting next month.'''
+    sample_article = {
+        'title': 'OpenAI Releases GPT-5: A New Era of AI',
+        'excerpt': 'OpenAI today announced GPT-5, the next generation of their language model.',
+        'content': sample_content,
+    }
+
+    result = BurmeseTranslator().translate_article(sample_article)
+
+    # Show the English source followed by its Burmese rendering
+    print("\n=== ORIGINAL ===")
+    print(f"Title: {result['title']}")
+    print(f"\nContent: {result['content'][:200]}...")
+
+    print("\n=== BURMESE ===")
+    print(f"Title: {result['title_burmese']}")
+    print(f"\nContent: {result['content_burmese'][:200]}...")