# forked from minzeyaphyo/burmddit
# Publisher module - Publishes translated articles to the website
import re
import time
from datetime import datetime, timedelta
from typing import List, Dict

from loguru import logger
from slugify import slugify

import config
import database
class ArticlePublisher:
    """Prepares translated articles and inserts them into the website database."""

    # Curated AI-related terms used for naive keyword extraction.
    # NOTE(review): replace with real NLP keyword extraction when available.
    _KEYWORD_CANDIDATES = [
        'AI', 'ChatGPT', 'GPT', 'OpenAI', 'Anthropic', 'Claude',
        'Machine Learning', 'Deep Learning', 'Neural Network',
        'LLM', 'Transformer', 'NLP', 'Computer Vision',
        'Automation', 'Generative AI'
    ]

    def __init__(self):
        pass

    def publish_articles(self, translated_articles: List[Dict]) -> int:
        """Publish translated articles to the website.

        Args:
            translated_articles: article dicts carrying at least 'title',
                'title_burmese', 'content' and 'content_burmese' keys.

        Returns:
            The number of articles successfully inserted.
        """
        published_count = 0

        for i, article in enumerate(translated_articles):
            try:
                logger.info(f"Publishing article {i+1}/{len(translated_articles)}: {article['title'][:50]}...")

                # Prepare article data
                article_data = self.prepare_article_for_publishing(article)

                # Insert into database; a falsy ID means the slug already
                # existed (ON CONFLICT DO NOTHING) or the insert failed.
                article_id = database.insert_article(**article_data)

                if article_id:
                    published_count += 1
                    logger.info(f"✓ Article published successfully (ID: {article_id})")
                    # TODO: mark the source raw articles as processed once
                    # raw-article IDs are tracked through the pipeline.
                else:
                    logger.warning("✗ Article already exists or failed to publish")

            except Exception as e:
                # One failed article must not abort the whole batch.
                logger.error(f"Error publishing article {i+1}: {e}")
                continue

        logger.info(f"Published {published_count}/{len(translated_articles)} articles")
        return published_count

    def prepare_article_for_publishing(self, article: Dict) -> Dict:
        """Prepare article data for database insertion."""
        # Generate slug from Burmese title (romanized) or English title
        slug = self.generate_slug(article.get('title_burmese', article['title']))

        # Fall back to truncated body text when no excerpt was provided;
        # the ellipsis is only appended when the text was actually cut.
        excerpt_burmese = article.get('excerpt_burmese') or self._truncate(article['content_burmese'], 200)
        excerpt = article.get('excerpt') or self._truncate(article['content'], 200)

        # Reading time is estimated from the Burmese body
        reading_time = self.calculate_reading_time(article['content_burmese'])

        # Detect category
        category_id = self.detect_category_id(article)

        # Meta description limited to the usual 160-char SEO budget
        meta_description = excerpt_burmese[:160]

        # Generate keywords from the Burmese title and body
        meta_keywords = self.extract_keywords(article['title_burmese'] + ' ' + article['content_burmese'])

        return {
            'title': article['title'],
            'title_burmese': article['title_burmese'],
            'slug': slug,
            'content': article['content'],
            'content_burmese': article['content_burmese'],
            'excerpt': excerpt,
            'excerpt_burmese': excerpt_burmese,
            'category_id': category_id,
            'featured_image': article.get('featured_image'),
            'images': article.get('images', []),  # multiple images
            'videos': article.get('videos', []),  # videos
            'source_articles': article.get('source_articles', []),
            'meta_description': meta_description,
            'meta_keywords': meta_keywords,
            'reading_time': reading_time,
            'status': config.PUBLISHING['status_default']
        }

    @staticmethod
    def _truncate(text: str, limit: int) -> str:
        """Cut `text` to `limit` chars, appending '...' only when actually cut."""
        return (text[:limit] + '...') if len(text) > limit else text

    def generate_slug(self, title: str) -> str:
        """Generate a URL-friendly slug (uniqueness is handled by the DB)."""
        # slugify transliterates/strips Unicode characters
        slug = slugify(title, max_length=100)

        # A purely non-ASCII (e.g. Burmese) title can slugify to '', so
        # fall back to a timestamp-based slug
        if not slug:
            slug = f"article-{int(time.time())}"

        # Database handles duplicate slugs with ON CONFLICT DO NOTHING
        return slug

    def calculate_reading_time(self, text: str) -> int:
        """Estimate reading time in whole minutes for Burmese text.

        Burmese reading speed is approximately 200-250 characters per minute
        (slower than English due to script complexity); always returns >= 1.
        """
        return max(1, round(len(text) / 225))

    def detect_category_id(self, article: Dict) -> int:
        """Return the category ID, preferring an explicit category hint."""
        if article.get('category_hint'):
            # Normalize the hint into slug form, e.g. 'Science & Tech' -> 'science-tech'
            category_slug = article['category_hint'].lower().replace(' & ', '-').replace(' ', '-')
            category = database.get_category_by_slug(category_slug)
            if category:
                return category['id']

        # Fall back to content-based detection
        return database.detect_category(
            article['title'] + ' ' + article.get('title_burmese', ''),
            article['content'][:500]
        )

    def extract_keywords(self, text: str, limit: int = 10) -> List[str]:
        """Return up to `limit` known AI terms that occur in `text`.

        Uses whole-word, case-insensitive matching so that short terms such
        as 'AI' do not false-positive inside ordinary words like 'said' or
        'main' (the previous lowercase substring check did exactly that).
        """
        found_keywords = []
        for keyword in self._KEYWORD_CANDIDATES:
            if re.search(r'\b' + re.escape(keyword) + r'\b', text, re.IGNORECASE):
                found_keywords.append(keyword)
                if len(found_keywords) >= limit:
                    break
        return found_keywords

    def schedule_publications(self, translated_articles: List[Dict]) -> int:
        """Schedule articles for staggered publication (future enhancement).

        Currently publishes everything immediately; a future version may use
        a publish-at timestamp to space out publications.
        """
        return self.publish_articles(translated_articles)
def run_publisher(translated_articles: List[Dict]) -> int:
    """Main publisher execution.

    Publishes every article in `translated_articles`, records the pipeline
    stage outcome in the database, and returns the number of articles
    published (0 when the whole run fails).
    """
    logger.info(f"Starting publisher for {len(translated_articles)} articles...")
    start_time = time.time()

    try:
        publisher = ArticlePublisher()
        published_count = publisher.publish_articles(translated_articles)

        duration = int(time.time() - start_time)
        database.log_pipeline_stage(
            stage='publish',
            status='completed',
            articles_processed=published_count,
            duration=duration
        )

        logger.info(f"Publisher completed in {duration}s. Articles published: {published_count}")
        return published_count

    except Exception as e:
        # Top-level boundary: logger.exception records the traceback,
        # which logger.error does not.
        logger.exception(f"Publisher failed: {e}")
        database.log_pipeline_stage(
            stage='publish',
            status='failed',
            error_message=str(e)
        )
        return 0
if __name__ == '__main__':
    # `logger` is already imported at module level; just configure file
    # logging for standalone runs (the redundant re-import was removed).
    logger.add(config.LOG_FILE, rotation="1 day")

    # Test with sample translated article
    test_article = {
        'title': 'OpenAI Releases GPT-5',
        'title_burmese': 'OpenAI က GPT-5 ကို ထုတ်ပြန်လိုက်ပြီ',
        'content': 'Full English content...',
        'content_burmese': 'OpenAI သည် ယနေ့ GPT-5 ကို တရားဝင် ထုတ်ပြန်လိုက်ပြီ ဖြစ်ပါသည်။...',
        'excerpt': 'OpenAI announces GPT-5...',
        'excerpt_burmese': 'OpenAI က GPT-5 ကို ကြေညာလိုက်ပါပြီ...',
        'source_articles': [{'url': 'https://example.com', 'title': 'Test', 'author': 'Test'}]
    }

    count = run_publisher([test_article])
    print(f"Published: {count}")
|