forked from minzeyaphyo/burmddit
Initial Burmddit deployment - AI news aggregator in Burmese
This commit is contained in:
199
backend/publisher.py
Normal file
199
backend/publisher.py
Normal file
@@ -0,0 +1,199 @@
|
||||
# Publisher module - Publishes translated articles to the website

import re
import time
from datetime import datetime, timedelta
from typing import Dict, List

from loguru import logger
from slugify import slugify

import config
import database
||||
|
||||
class ArticlePublisher:
    """Publishes translated articles to the website database.

    Turns a translated-article dict into the row shape expected by
    ``database.insert_article``, deriving the slug, excerpts, reading
    time, category and SEO metadata along the way.
    """

    def __init__(self):
        # Stateless for now; kept so callers can construct instances.
        pass

    def publish_articles(self, translated_articles: List[Dict]) -> int:
        """Publish translated articles to the website.

        Each article is prepared and inserted independently: a failure
        (or duplicate) on one article is logged and skipped so the rest
        of the batch still publishes.

        Returns:
            Number of articles actually inserted.
        """
        published_count = 0

        for i, article in enumerate(translated_articles):
            try:
                logger.info(f"Publishing article {i+1}/{len(translated_articles)}: {article['title'][:50]}...")

                # Map the pipeline dict onto the database row shape.
                article_data = self.prepare_article_for_publishing(article)

                # insert_article returns the new row id, or a falsy value
                # when the article already exists / the insert failed.
                article_id = database.insert_article(**article_data)

                if article_id:
                    published_count += 1
                    logger.info(f"✓ Article published successfully (ID: {article_id})")
                    # TODO: mark the originating raw articles as processed
                    # once raw_article IDs are threaded through the pipeline.
                else:
                    logger.warning("✗ Article already exists or failed to publish")

            except Exception as e:
                logger.error(f"Error publishing article {i+1}: {e}")
                continue

        logger.info(f"Published {published_count}/{len(translated_articles)} articles")
        return published_count

    def prepare_article_for_publishing(self, article: Dict) -> Dict:
        """Prepare article data for database insertion.

        Requires 'title', 'title_burmese', 'content' and 'content_burmese'
        keys; every other field is derived or defaulted here.
        """
        # Slug from the Burmese title (slugify romanizes what it can),
        # falling back to the English title.
        slug = self.generate_slug(article.get('title_burmese', article['title']))

        # Ensure excerpts exist; fall back to a truncated body.
        excerpt_burmese = article.get('excerpt_burmese') or article['content_burmese'][:200] + '...'
        excerpt = article.get('excerpt') or article['content'][:200] + '...'

        # Reading time is computed from the Burmese body, since that is
        # what site visitors actually read.
        reading_time = self.calculate_reading_time(article['content_burmese'])

        category_id = self.detect_category_id(article)

        # ~160 characters is the conventional meta-description limit.
        meta_description = excerpt_burmese[:160]

        meta_keywords = self.extract_keywords(article['title_burmese'] + ' ' + article['content_burmese'])

        # Source attribution, stored as JSONB by the database layer.
        source_articles = article.get('source_articles', [])

        return {
            'title': article['title'],
            'title_burmese': article['title_burmese'],
            'slug': slug,
            'content': article['content'],
            'content_burmese': article['content_burmese'],
            'excerpt': excerpt,
            'excerpt_burmese': excerpt_burmese,
            'category_id': category_id,
            'featured_image': article.get('featured_image'),
            'images': article.get('images', []),    # multiple images
            'videos': article.get('videos', []),    # videos
            'source_articles': source_articles,
            'meta_description': meta_description,
            'meta_keywords': meta_keywords,
            'reading_time': reading_time,
            'status': config.PUBLISHING['status_default'],
        }

    def generate_slug(self, title: str) -> str:
        """Generate a URL-friendly slug from ``title``.

        Burmese-only titles slugify to an empty string, in which case a
        timestamp-based slug is used instead. Uniqueness is left to the
        database (conflicts handled with ON CONFLICT DO NOTHING).
        """
        slug = slugify(title, max_length=100)

        if not slug:
            # Title had no ASCII-transliterable characters.
            slug = f"article-{int(time.time())}"

        return slug

    def calculate_reading_time(self, text: str) -> int:
        """Calculate reading time in minutes for Burmese text.

        Assumes roughly 200-250 characters per minute (slower than
        English due to script complexity); always at least 1 minute.
        """
        chars = len(text)
        minutes = max(1, round(chars / 225))
        return minutes

    def detect_category_id(self, article: Dict) -> int:
        """Resolve the category id for ``article``.

        An explicit 'category_hint' is slugified and looked up first;
        otherwise falls back to content-based detection in the database
        layer.
        """
        if article.get('category_hint'):
            category_slug = article['category_hint'].lower().replace(' & ', '-').replace(' ', '-')
            category = database.get_category_by_slug(category_slug)
            if category:
                return category['id']

        # Content-based fallback; first 500 chars keep the query cheap.
        return database.detect_category(
            article['title'] + ' ' + article.get('title_burmese', ''),
            article['content'][:500]
        )

    def extract_keywords(self, text: str, limit: int = 10) -> List[str]:
        """Extract up to ``limit`` known AI keywords appearing in ``text``.

        Matching requires that the keyword is not flanked by ASCII
        letters, so 'AI' no longer fires inside 'maintain'/'openai' and
        'GPT' no longer fires inside 'ChatGPT'. ASCII-only boundaries
        (rather than regex \\b) are used deliberately: Burmese script
        characters count as word characters, and keywords embedded
        directly against Burmese text should still match.
        """
        # Simple curated list (can be improved with NLP).
        keywords = [
            'AI', 'ChatGPT', 'GPT', 'OpenAI', 'Anthropic', 'Claude',
            'Machine Learning', 'Deep Learning', 'Neural Network',
            'LLM', 'Transformer', 'NLP', 'Computer Vision',
            'Automation', 'Generative AI',
        ]

        text_lower = text.lower()
        found_keywords = []

        for keyword in keywords:
            pattern = r'(?<![a-z])' + re.escape(keyword.lower()) + r'(?![a-z])'
            if re.search(pattern, text_lower):
                found_keywords.append(keyword)

        return found_keywords[:limit]

    def schedule_publications(self, translated_articles: List[Dict]) -> int:
        """Schedule articles for staggered publication (future enhancement).

        For now, publishes everything immediately; in future this should
        use a PUBLISH_AT timestamp to space out publications.
        """
        return self.publish_articles(translated_articles)
|
||||
|
||||
def run_publisher(translated_articles: List[Dict]) -> int:
    """Run the publish stage for a batch of translated articles.

    Publishes every article, records the stage outcome in the pipeline
    log, and returns the number of articles published (0 on failure).
    """
    logger.info(f"Starting publisher for {len(translated_articles)} articles...")
    start_time = time.time()

    try:
        # Publish the whole batch, then record the completed stage.
        published_count = ArticlePublisher().publish_articles(translated_articles)
        duration = int(time.time() - start_time)

        database.log_pipeline_stage(
            stage='publish',
            status='completed',
            articles_processed=published_count,
            duration=duration,
        )

        logger.info(f"Publisher completed in {duration}s. Articles published: {published_count}")
        return published_count

    except Exception as e:
        # Top-level boundary: log, record the failed stage, report zero.
        logger.error(f"Publisher failed: {e}")
        database.log_pipeline_stage(
            stage='publish',
            status='failed',
            error_message=str(e),
        )
        return 0
|
||||
|
||||
if __name__ == '__main__':
    from loguru import logger
    # Log to the configured file, rotating daily.
    logger.add(config.LOG_FILE, rotation="1 day")

    # Test with sample translated article
    # Smoke test: pushes one hand-crafted translated article through the
    # publisher (this inserts into the real database via run_publisher).
    test_article = {
        'title': 'OpenAI Releases GPT-5',
        'title_burmese': 'OpenAI က GPT-5 ကို ထုတ်ပြန်လိုက်ပြီ',
        'content': 'Full English content...',
        'content_burmese': 'OpenAI သည် ယနေ့ GPT-5 ကို တရားဝင် ထုတ်ပြန်လိုက်ပြီ ဖြစ်ပါသည်။...',
        'excerpt': 'OpenAI announces GPT-5...',
        'excerpt_burmese': 'OpenAI က GPT-5 ကို ကြေညာလိုက်ပါပြီ...',
        'source_articles': [{'url': 'https://example.com', 'title': 'Test', 'author': 'Test'}]
    }

    count = run_publisher([test_article])
    print(f"Published: {count}")
|
||||
Reference in New Issue
Block a user