# Forked from minzeyaphyo/burmddit
#
# Commit notes:
#   Frontend changes:
#     - Add /admin dashboard for article management
#     - Add AdminButton component (Alt+Shift+A on articles)
#     - Add /api/admin/article API endpoints
#   Backend improvements:
#     - scraper_v2.py: Multi-layer fallback extraction
#       (newspaper → trafilatura → readability)
#     - translator_v2.py: Better chunking, repetition detection, validation
#     - admin_tools.py: CLI admin commands
#     - test_scraper.py: Individual source testing
#   Docs:
#     - WEB-ADMIN-GUIDE.md: Web admin usage
#     - ADMIN-GUIDE.md: CLI admin usage
#     - SCRAPER-IMPROVEMENT-PLAN.md: Scraper fixes details
#     - TRANSLATION-FIX.md: Translation improvements
#     - ADMIN-FEATURES-SUMMARY.md: Implementation summary
#   Fixes:
#     - Article scraping from 0 → 96+ articles working
#     - Translation quality issues (repetition, truncation)
#     - Added 13 new RSS sources
#
# File info: 214 lines, 6.4 KiB, Python
# Burmddit Configuration

import os

from dotenv import load_dotenv

# Load settings from a .env file (when one exists) into the process
# environment so the os.getenv() lookups in this module can see them.
load_dotenv()

# Database
# Connection URL; falls back to a local "burmddit" PostgreSQL database when
# the DATABASE_URL environment variable is not set.
DATABASE_URL = os.getenv('DATABASE_URL', 'postgresql://localhost/burmddit')

# APIs
# Both keys are read from the environment and are None when unset.
ANTHROPIC_API_KEY = os.getenv('ANTHROPIC_API_KEY')
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')  # Optional, for embeddings

# Scraping sources (v2 list, updated with working sources).
# Each entry is keyed by a short source name. 'enabled' switches the source
# on or off; 'url' is the feed address; 'articles_limit' is the per-source cap.
# NOTE(review): the exact meaning of each key is defined by the scraper, which
# is not visible from this file — confirm against the consumer.
SOURCES = {
    # WORKING SOURCES (tested 2026-02-26)
    'techcrunch': {
        'enabled': True,
        'category': 'artificial-intelligence',
        'url': 'https://techcrunch.com/category/artificial-intelligence/feed/',
        'articles_limit': 30
    },
    'mit_tech_review': {
        'enabled': True,
        'url': 'https://www.technologyreview.com/feed/',
        # presumably restricts this general feed to AI items — verify in scraper
        'filter_ai': True,
        'articles_limit': 20
    },
    'wired_ai': {
        'enabled': True,
        'url': 'https://www.wired.com/feed/tag/ai/latest/rss',
        'articles_limit': 15
    },

    # NEW HIGH-QUALITY SOURCES (Priority Tier 1)
    'openai_blog': {
        'enabled': True,
        'url': 'https://openai.com/blog/rss/',
        'articles_limit': 10
    },
    'huggingface': {
        'enabled': True,
        'url': 'https://huggingface.co/blog/feed.xml',
        'articles_limit': 15
    },
    'google_ai': {
        'enabled': True,
        # Fetched over HTTPS rather than plaintext HTTP.
        'url': 'https://googleaiblog.blogspot.com/atom.xml',
        'articles_limit': 15
    },
    'marktechpost': {
        'enabled': True,
        'url': 'https://www.marktechpost.com/feed/',
        'articles_limit': 25
    },
    'the_rundown_ai': {
        'enabled': True,
        'url': 'https://rss.beehiiv.com/feeds/2R3C6Bt5wj.xml',
        'articles_limit': 10
    },
    'last_week_ai': {
        'enabled': True,
        'url': 'https://lastweekin.ai/feed',
        'articles_limit': 10
    },
    'ai_news': {
        'enabled': True,
        'url': 'https://www.artificialintelligence-news.com/feed/rss/',
        'articles_limit': 20
    },

    # NEW SOURCES (Priority Tier 2)
    'kdnuggets': {
        'enabled': True,
        'url': 'https://www.kdnuggets.com/feed',
        'articles_limit': 20
    },
    'the_decoder': {
        'enabled': True,
        'url': 'https://the-decoder.com/feed/',
        'articles_limit': 20
    },
    'ai_business': {
        'enabled': True,
        'url': 'https://aibusiness.com/rss.xml',
        'articles_limit': 15
    },
    'unite_ai': {
        'enabled': True,
        'url': 'https://www.unite.ai/feed/',
        'articles_limit': 15
    },
    'simonwillison': {
        'enabled': True,
        'url': 'https://simonwillison.net/atom/everything/',
        'articles_limit': 10
    },
    'latent_space': {
        'enabled': True,
        'url': 'https://www.latent.space/feed',
        'articles_limit': 10
    },

    # BROKEN SOURCES (disabled temporarily)
    # Unlike the feed entries, 'medium' is described by a tag list, a URL
    # template with a {tag} placeholder, and a per-tag article count.
    'medium': {
        'enabled': False,  # Scraping broken
        'tags': ['artificial-intelligence', 'machine-learning', 'chatgpt'],
        'url_pattern': 'https://medium.com/tag/{tag}/latest',
        'articles_per_tag': 15
    },
    'venturebeat': {
        'enabled': False,  # RSS feed empty
        'url': 'https://venturebeat.com/category/ai/feed/',
        'articles_limit': 25
    },
    'theverge': {
        'enabled': False,  # RSS feed empty
        'url': 'https://www.theverge.com/ai-artificial-intelligence/rss/index.xml',
        'articles_limit': 20
    },
    'arstechnica': {
        'enabled': False,  # Needs testing
        'url': 'https://arstechnica.com/tag/artificial-intelligence/feed/',
        'articles_limit': 15
    },
    'hackernews': {
        'enabled': False,  # Needs testing
        'url': 'https://hnrss.org/newest?q=AI+OR+ChatGPT+OR+OpenAI',
        'articles_limit': 30
    }
}

# Content pipeline settings
# NOTE(review): the unit of the two length limits (words vs. characters) is
# not stated anywhere in this file — confirm against the pipeline code.
PIPELINE = {
    'articles_per_day': 30,  # Increased: more content = more traffic
    'min_article_length': 600,  # Shorter, easier to read
    'max_article_length': 1000,  # Keep it concise
    'sources_per_article': 3,  # How many articles to compile into one
    'clustering_threshold': 0.6,  # Lower threshold = more diverse topics
    'research_time_minutes': 90,  # Spend 1.5 hours researching daily
}

# Category mapping (keyword-based)
# Maps each category display name to the keywords associated with it.
CATEGORY_KEYWORDS = {
    'AI News': [
        'news', 'announcement', 'report', 'industry', 'company', 'funding',
        'release',
    ],
    'AI Tutorials': [
        'how to', 'tutorial', 'guide', 'step by step', 'learn', 'beginners',
        'course',
    ],
    'Tips & Tricks': [
        'tips', 'tricks', 'hacks', 'productivity', 'best practices',
        'optimize', 'improve',
    ],
    'Upcoming Releases': [
        'upcoming', 'soon', 'preview', 'roadmap', 'future', 'expected',
        'announce',
    ],
}

# Translation settings
TRANSLATION = {
    # Model name can be overridden through the CLAUDE_MODEL env variable.
    'model': os.getenv('CLAUDE_MODEL', 'claude-3-haiku-20240307'),
    'max_tokens': 4000,
    'temperature': 0.5,  # Higher = more natural, casual translation
    'preserve_terms': [  # Technical terms to keep in English
        'AI', 'ChatGPT', 'GPT', 'Claude', 'API', 'ML', 'NLP',
        'LLM', 'Transformer', 'Neural Network', 'Python', 'GitHub',
        'DeepSeek', 'OpenAI', 'Anthropic', 'Google', 'Meta'
    ],
    'style': 'casual',  # Casual, conversational tone
    'target_audience': 'general',  # Not just tech experts
    'simplify_jargon': True,  # Explain technical terms simply
}

# Publishing settings
PUBLISHING = {
    'status_default': 'published',  # or 'draft' for manual review
    'publish_interval_hours': 1,  # Space out publications
    # NOTE(review): 'featured_image_required' (False) contradicts
    # 'require_featured_image' (True) further down. Both values are kept
    # as-is because the consumer is not visible here — find out which key the
    # publisher actually reads and remove the other.
    'featured_image_required': False,
    'auto_generate_excerpt': True,
    'excerpt_length': 200,  # characters
    'require_featured_image': True,  # Every article needs an image
    'extract_videos': True,  # Extract YouTube/video embeds
    'max_images_per_article': 5,  # Include multiple images
    'image_fallback': 'generate'  # If no image, generate AI image
}

# SEO settings
SEO = {
    'meta_description_length': 160,
    'keywords_per_article': 10,
    'auto_generate_slug': True
}

# Burmese font settings
BURMESE = {
    'font_family': 'Pyidaungsu',
    'fallback_fonts': ['Noto Sans Myanmar', 'Myanmar Text'],
    'unicode_range': 'U+1000-109F'  # Myanmar Unicode range
}

# Admin
# NOTE(review): when ADMIN_PASSWORD is unset this falls back to a publicly
# known placeholder. The value is left unchanged for compatibility; make sure
# every deployment sets ADMIN_PASSWORD in its environment.
ADMIN_PASSWORD = os.getenv('ADMIN_PASSWORD', 'change_me_in_production')

# Logging
# Level name comes from the LOG_LEVEL env variable and defaults to 'INFO';
# the log file name is fixed.
LOG_LEVEL = os.getenv('LOG_LEVEL', 'INFO')
LOG_FILE = 'burmddit_pipeline.log'

# Rate limiting
RATE_LIMITS = {
    'requests_per_minute': 30,
    'anthropic_rpm': 50,
    'delay_between_requests': 2  # seconds
}

# Retry settings
RETRY = {
    'max_attempts': 3,
    'backoff_factor': 2,
    'timeout': 30  # seconds
}