# burmddit/backend/config.py

# Burmddit Configuration

import os
import warnings

from dotenv import load_dotenv

# Load variables from a local .env file (if one exists) into the process
# environment. This runs before the os.getenv() lookups below so that they
# can see those values.
load_dotenv()

# Database connection string. When DATABASE_URL is absent from the
# environment, a local "burmddit" PostgreSQL database is used instead.
DATABASE_URL = os.environ.get("DATABASE_URL", "postgresql://localhost/burmddit")

# API credentials. Each one is None when its variable is not set.
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")  # optional; used for embeddings

# Scraping sources (V2 source list).
#
# Maps a source name to its settings. Keys used by the entries below:
#   'enabled'        - bool switch for the source.
#   'url'            - feed URL (every entry except 'medium').
#   'articles_limit' - integer cap for the source (presumably the maximum
#                      number of articles taken per run - confirm in scraper).
#   'category'       - only on 'techcrunch'.
#   'filter_ai'      - only on 'mit_tech_review'.
#   'tags' / 'url_pattern' / 'articles_per_tag'
#                    - only on 'medium'; 'url_pattern' carries a '{tag}'
#                      placeholder to be filled from 'tags'.
#
# All feed URLs use HTTPS so feed content cannot be tampered with in transit.
SOURCES = {
    # WORKING SOURCES (tested 2026-02-26)
    'techcrunch': {
        'enabled': True,
        'category': 'artificial-intelligence',
        'url': 'https://techcrunch.com/category/artificial-intelligence/feed/',
        'articles_limit': 30
    },
    'mit_tech_review': {
        'enabled': True,
        'url': 'https://www.technologyreview.com/feed/',
        'filter_ai': True,
        'articles_limit': 20
    },
    'wired_ai': {
        'enabled': True,
        'url': 'https://www.wired.com/feed/tag/ai/latest/rss',
        'articles_limit': 15
    },

    # NEW HIGH-QUALITY SOURCES (Priority Tier 1)
    'openai_blog': {
        'enabled': True,
        'url': 'https://openai.com/blog/rss/',
        'articles_limit': 10
    },
    'huggingface': {
        'enabled': True,
        'url': 'https://huggingface.co/blog/feed.xml',
        'articles_limit': 15
    },
    'google_ai': {
        'enabled': True,
        # Was plain http://; switched to HTTPS like every other feed here.
        'url': 'https://googleaiblog.blogspot.com/atom.xml',
        'articles_limit': 15
    },
    'marktechpost': {
        'enabled': True,
        'url': 'https://www.marktechpost.com/feed/',
        'articles_limit': 25
    },
    'the_rundown_ai': {
        'enabled': True,
        'url': 'https://rss.beehiiv.com/feeds/2R3C6Bt5wj.xml',
        'articles_limit': 10
    },
    'last_week_ai': {
        'enabled': True,
        'url': 'https://lastweekin.ai/feed',
        'articles_limit': 10
    },
    'ai_news': {
        'enabled': True,
        'url': 'https://www.artificialintelligence-news.com/feed/rss/',
        'articles_limit': 20
    },

    # NEW SOURCES (Priority Tier 2)
    'kdnuggets': {
        'enabled': True,
        'url': 'https://www.kdnuggets.com/feed',
        'articles_limit': 20
    },
    'the_decoder': {
        'enabled': True,
        'url': 'https://the-decoder.com/feed/',
        'articles_limit': 20
    },
    'ai_business': {
        'enabled': True,
        'url': 'https://aibusiness.com/rss.xml',
        'articles_limit': 15
    },
    'unite_ai': {
        'enabled': True,
        'url': 'https://www.unite.ai/feed/',
        'articles_limit': 15
    },
    'simonwillison': {
        'enabled': True,
        'url': 'https://simonwillison.net/atom/everything/',
        'articles_limit': 10
    },
    'latent_space': {
        'enabled': True,
        'url': 'https://www.latent.space/feed',
        'articles_limit': 10
    },

    # DISABLED SOURCES (broken or not yet verified; see per-entry notes)
    'medium': {
        'enabled': False,  # Scraping broken
        'tags': ['artificial-intelligence', 'machine-learning', 'chatgpt'],
        'url_pattern': 'https://medium.com/tag/{tag}/latest',
        'articles_per_tag': 15
    },
    'venturebeat': {
        'enabled': False,  # RSS feed empty
        'url': 'https://venturebeat.com/category/ai/feed/',
        'articles_limit': 25
    },
    'theverge': {
        'enabled': False,  # RSS feed empty
        'url': 'https://www.theverge.com/ai-artificial-intelligence/rss/index.xml',
        'articles_limit': 20
    },
    'arstechnica': {
        'enabled': False,  # Needs testing
        'url': 'https://arstechnica.com/tag/artificial-intelligence/feed/',
        'articles_limit': 15
    },
    'hackernews': {
        'enabled': False,  # Needs testing
        'url': 'https://hnrss.org/newest?q=AI+OR+ChatGPT+OR+OpenAI',
        'articles_limit': 30
    }
}

# Content pipeline settings
PIPELINE = {
    # Daily output target (raised to bring in more traffic).
    "articles_per_day": 30,
    # Length bounds keep articles short and easy to read
    # (unit is not stated here - TODO confirm words vs. characters).
    "min_article_length": 600,
    "max_article_length": 1000,
    # Number of source articles compiled into one published article.
    "sources_per_article": 3,
    # A lower threshold yields more diverse topics.
    "clustering_threshold": 0.6,
    # Daily research budget: 1.5 hours.
    "research_time_minutes": 90,
}

# Keyword-based category mapping: category name -> keywords associated with it.
CATEGORY_KEYWORDS = {
    "AI News": [
        "news", "announcement", "report", "industry",
        "company", "funding", "release",
    ],
    "AI Tutorials": [
        "how to", "tutorial", "guide", "step by step",
        "learn", "beginners", "course",
    ],
    "Tips & Tricks": [
        "tips", "tricks", "hacks", "productivity",
        "best practices", "optimize", "improve",
    ],
    "Upcoming Releases": [
        "upcoming", "soon", "preview", "roadmap",
        "future", "expected", "announce",
    ],
}

# Translation settings
TRANSLATION = {
    # Model name; the CLAUDE_MODEL environment variable overrides the default.
    "model": os.environ.get("CLAUDE_MODEL", "claude-3-haiku-20240307"),
    "max_tokens": 4000,
    # A higher temperature gives a more natural, casual translation.
    "temperature": 0.5,
    # Technical terms that stay in English rather than being translated.
    "preserve_terms": [
        "AI",
        "ChatGPT",
        "GPT",
        "Claude",
        "API",
        "ML",
        "NLP",
        "LLM",
        "Transformer",
        "Neural Network",
        "Python",
        "GitHub",
        "DeepSeek",
        "OpenAI",
        "Anthropic",
        "Google",
        "Meta",
    ],
    "style": "casual",  # conversational tone
    "target_audience": "general",  # not only tech experts
    "simplify_jargon": True,  # explain technical terms in simple words
}

# Publishing settings
PUBLISHING = {
    'status_default': 'published',  # or 'draft' for manual review
    'publish_interval_hours': 1,  # Space out publications
    # Same requirement as 'require_featured_image' below, under a second key
    # name. It previously said False, contradicting that key; both now agree.
    # NOTE(review): confirm which key the pipeline reads, then drop the other.
    'featured_image_required': True,
    'auto_generate_excerpt': True,
    'excerpt_length': 200,  # characters
    'require_featured_image': True,  # Every article needs an image
    'extract_videos': True,  # Extract YouTube/video embeds
    'max_images_per_article': 5,  # Include multiple images
    'image_fallback': 'generate'  # If no image, generate AI image
}

# SEO settings
SEO = {
    "meta_description_length": 160,  # presumably characters - TODO confirm
    "keywords_per_article": 10,
    "auto_generate_slug": True,
}

# Burmese font settings
BURMESE = {
    "font_family": "Pyidaungsu",
    "fallback_fonts": [
        "Noto Sans Myanmar",
        "Myanmar Text",
    ],
    "unicode_range": "U+1000-109F",  # the Myanmar Unicode range
}

# Admin
# Admin password, read from the ADMIN_PASSWORD environment variable.
# The placeholder fallback is kept so existing setups still import cleanly,
# but it is a publicly known value, so warn whenever it is actually in effect.
_DEFAULT_ADMIN_PASSWORD = 'change_me_in_production'
ADMIN_PASSWORD = os.getenv('ADMIN_PASSWORD', _DEFAULT_ADMIN_PASSWORD)
if ADMIN_PASSWORD == _DEFAULT_ADMIN_PASSWORD:
    warnings.warn(
        'ADMIN_PASSWORD is not set; falling back to the insecure placeholder '
        'password. Set ADMIN_PASSWORD in the environment or the .env file.',
        RuntimeWarning,
    )

# Logging: the level comes from the LOG_LEVEL environment variable and
# falls back to INFO; the log file name is fixed.
LOG_LEVEL = os.environ.get("LOG_LEVEL", "INFO")
LOG_FILE = "burmddit_pipeline.log"

# Rate limiting
RATE_LIMITS = {
    "requests_per_minute": 30,
    "anthropic_rpm": 50,  # presumably requests per minute to the Anthropic API
    "delay_between_requests": 2,  # in seconds
}

# Retry settings
RETRY = {
    "max_attempts": 3,
    "backoff_factor": 2,
    "timeout": 30,  # in seconds
}