forked from minzeyaphyo/burmddit
Initial Burmddit deployment - AI news aggregator in Burmese
This commit is contained in:
199
backend/publisher.py
Normal file
199
backend/publisher.py
Normal file
@@ -0,0 +1,199 @@
|
||||
# Publisher module - Publishes translated articles to the website

import re
import time
from datetime import datetime, timedelta
from typing import Dict, List

from loguru import logger
from slugify import slugify

import config
import database
||||
|
||||
class ArticlePublisher:
    """Publishes translated articles to the website database.

    Turns a translated-article dict into the row shape expected by
    ``database.insert_article``, deriving the slug, excerpts, reading
    time, category and SEO metadata along the way.
    """

    def __init__(self):
        # Stateless for now; kept so callers can construct instances.
        pass

    def publish_articles(self, translated_articles: List[Dict]) -> int:
        """Publish translated articles to the website.

        Each article is prepared and inserted independently: a failure
        (or duplicate) on one article is logged and skipped so the rest
        of the batch still publishes.

        Returns:
            Number of articles actually inserted.
        """
        published_count = 0

        for i, article in enumerate(translated_articles):
            try:
                logger.info(f"Publishing article {i+1}/{len(translated_articles)}: {article['title'][:50]}...")

                # Map the pipeline dict onto the database row shape.
                article_data = self.prepare_article_for_publishing(article)

                # insert_article returns the new row id, or a falsy value
                # when the article already exists / the insert failed.
                article_id = database.insert_article(**article_data)

                if article_id:
                    published_count += 1
                    logger.info(f"✓ Article published successfully (ID: {article_id})")
                    # TODO: mark the originating raw articles as processed
                    # once raw_article IDs are threaded through the pipeline.
                else:
                    logger.warning("✗ Article already exists or failed to publish")

            except Exception as e:
                logger.error(f"Error publishing article {i+1}: {e}")
                continue

        logger.info(f"Published {published_count}/{len(translated_articles)} articles")
        return published_count

    def prepare_article_for_publishing(self, article: Dict) -> Dict:
        """Prepare article data for database insertion.

        Requires 'title', 'title_burmese', 'content' and 'content_burmese'
        keys; every other field is derived or defaulted here.
        """
        # Slug from the Burmese title (slugify romanizes what it can),
        # falling back to the English title.
        slug = self.generate_slug(article.get('title_burmese', article['title']))

        # Ensure excerpts exist; fall back to a truncated body.
        excerpt_burmese = article.get('excerpt_burmese') or article['content_burmese'][:200] + '...'
        excerpt = article.get('excerpt') or article['content'][:200] + '...'

        # Reading time is computed from the Burmese body, since that is
        # what site visitors actually read.
        reading_time = self.calculate_reading_time(article['content_burmese'])

        category_id = self.detect_category_id(article)

        # ~160 characters is the conventional meta-description limit.
        meta_description = excerpt_burmese[:160]

        meta_keywords = self.extract_keywords(article['title_burmese'] + ' ' + article['content_burmese'])

        # Source attribution, stored as JSONB by the database layer.
        source_articles = article.get('source_articles', [])

        return {
            'title': article['title'],
            'title_burmese': article['title_burmese'],
            'slug': slug,
            'content': article['content'],
            'content_burmese': article['content_burmese'],
            'excerpt': excerpt,
            'excerpt_burmese': excerpt_burmese,
            'category_id': category_id,
            'featured_image': article.get('featured_image'),
            'images': article.get('images', []),    # multiple images
            'videos': article.get('videos', []),    # videos
            'source_articles': source_articles,
            'meta_description': meta_description,
            'meta_keywords': meta_keywords,
            'reading_time': reading_time,
            'status': config.PUBLISHING['status_default'],
        }

    def generate_slug(self, title: str) -> str:
        """Generate a URL-friendly slug from ``title``.

        Burmese-only titles slugify to an empty string, in which case a
        timestamp-based slug is used instead. Uniqueness is left to the
        database (conflicts handled with ON CONFLICT DO NOTHING).
        """
        slug = slugify(title, max_length=100)

        if not slug:
            # Title had no ASCII-transliterable characters.
            slug = f"article-{int(time.time())}"

        return slug

    def calculate_reading_time(self, text: str) -> int:
        """Calculate reading time in minutes for Burmese text.

        Assumes roughly 200-250 characters per minute (slower than
        English due to script complexity); always at least 1 minute.
        """
        chars = len(text)
        minutes = max(1, round(chars / 225))
        return minutes

    def detect_category_id(self, article: Dict) -> int:
        """Resolve the category id for ``article``.

        An explicit 'category_hint' is slugified and looked up first;
        otherwise falls back to content-based detection in the database
        layer.
        """
        if article.get('category_hint'):
            category_slug = article['category_hint'].lower().replace(' & ', '-').replace(' ', '-')
            category = database.get_category_by_slug(category_slug)
            if category:
                return category['id']

        # Content-based fallback; first 500 chars keep the query cheap.
        return database.detect_category(
            article['title'] + ' ' + article.get('title_burmese', ''),
            article['content'][:500]
        )

    def extract_keywords(self, text: str, limit: int = 10) -> List[str]:
        """Extract up to ``limit`` known AI keywords appearing in ``text``.

        Matching requires that the keyword is not flanked by ASCII
        letters, so 'AI' no longer fires inside 'maintain'/'openai' and
        'GPT' no longer fires inside 'ChatGPT'. ASCII-only boundaries
        (rather than regex \\b) are used deliberately: Burmese script
        characters count as word characters, and keywords embedded
        directly against Burmese text should still match.
        """
        # Simple curated list (can be improved with NLP).
        keywords = [
            'AI', 'ChatGPT', 'GPT', 'OpenAI', 'Anthropic', 'Claude',
            'Machine Learning', 'Deep Learning', 'Neural Network',
            'LLM', 'Transformer', 'NLP', 'Computer Vision',
            'Automation', 'Generative AI',
        ]

        text_lower = text.lower()
        found_keywords = []

        for keyword in keywords:
            pattern = r'(?<![a-z])' + re.escape(keyword.lower()) + r'(?![a-z])'
            if re.search(pattern, text_lower):
                found_keywords.append(keyword)

        return found_keywords[:limit]

    def schedule_publications(self, translated_articles: List[Dict]) -> int:
        """Schedule articles for staggered publication (future enhancement).

        For now, publishes everything immediately; in future this should
        use a PUBLISH_AT timestamp to space out publications.
        """
        return self.publish_articles(translated_articles)
|
||||
|
||||
def run_publisher(translated_articles: List[Dict]) -> int:
    """Run the publish stage for a batch of translated articles.

    Publishes every article, records the stage outcome in the pipeline
    log, and returns the number of articles published (0 on failure).
    """
    logger.info(f"Starting publisher for {len(translated_articles)} articles...")
    start_time = time.time()

    try:
        # Publish the whole batch, then record the completed stage.
        published_count = ArticlePublisher().publish_articles(translated_articles)
        duration = int(time.time() - start_time)

        database.log_pipeline_stage(
            stage='publish',
            status='completed',
            articles_processed=published_count,
            duration=duration,
        )

        logger.info(f"Publisher completed in {duration}s. Articles published: {published_count}")
        return published_count

    except Exception as e:
        # Top-level boundary: log, record the failed stage, report zero.
        logger.error(f"Publisher failed: {e}")
        database.log_pipeline_stage(
            stage='publish',
            status='failed',
            error_message=str(e),
        )
        return 0
|
||||
|
||||
if __name__ == '__main__':
    from loguru import logger
    # Log to the configured file, rotating daily.
    logger.add(config.LOG_FILE, rotation="1 day")

    # Test with sample translated article
    # Smoke test: pushes one hand-crafted translated article through the
    # publisher (this inserts into the real database via run_publisher).
    test_article = {
        'title': 'OpenAI Releases GPT-5',
        'title_burmese': 'OpenAI က GPT-5 ကို ထုတ်ပြန်လိုက်ပြီ',
        'content': 'Full English content...',
        'content_burmese': 'OpenAI သည် ယနေ့ GPT-5 ကို တရားဝင် ထုတ်ပြန်လိုက်ပြီ ဖြစ်ပါသည်။...',
        'excerpt': 'OpenAI announces GPT-5...',
        'excerpt_burmese': 'OpenAI က GPT-5 ကို ကြေညာလိုက်ပါပြီ...',
        'source_articles': [{'url': 'https://example.com', 'title': 'Test', 'author': 'Test'}]
    }

    count = run_publisher([test_article])
    print(f"Published: {count}")
|
||||
Reference in New Issue
Block a user