forked from minzeyaphyo/burmddit
Add web admin features + fix scraper & translator
Frontend changes:
- Add /admin dashboard for article management
- Add AdminButton component (Alt+Shift+A on articles)
- Add /api/admin/article API endpoints

Backend improvements:
- scraper_v2.py: Multi-layer fallback extraction (newspaper → trafilatura → readability)
- translator_v2.py: Better chunking, repetition detection, validation
- admin_tools.py: CLI admin commands
- test_scraper.py: Individual source testing

Docs:
- WEB-ADMIN-GUIDE.md: Web admin usage
- ADMIN-GUIDE.md: CLI admin usage
- SCRAPER-IMPROVEMENT-PLAN.md: Details of the scraper fixes
- TRANSLATION-FIX.md: Translation improvements
- ADMIN-FEATURES-SUMMARY.md: Implementation summary

Fixes:
- Article scraping restored (from 0 to 96+ articles working)
- Translation quality issues (repetition, truncation)
- Added 13 new RSS sources
This commit is contained in:
@@ -12,35 +12,19 @@ DATABASE_URL = os.getenv('DATABASE_URL', 'postgresql://localhost/burmddit')
|
||||
ANTHROPIC_API_KEY = os.getenv('ANTHROPIC_API_KEY')
|
||||
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY') # Optional, for embeddings
|
||||
|
||||
# Scraping sources - 🔥 EXPANDED for more content!
|
||||
# Scraping sources - 🔥 V2 UPDATED with working sources!
|
||||
SOURCES = {
|
||||
'medium': {
|
||||
'enabled': True,
|
||||
'tags': ['artificial-intelligence', 'machine-learning', 'chatgpt', 'ai-tools',
|
||||
'generative-ai', 'deeplearning', 'prompt-engineering', 'ai-news'],
|
||||
'url_pattern': 'https://medium.com/tag/{tag}/latest',
|
||||
'articles_per_tag': 15 # Increased from 10
|
||||
},
|
||||
# WORKING SOURCES (tested 2026-02-26)
|
||||
'techcrunch': {
|
||||
'enabled': True,
|
||||
'category': 'artificial-intelligence',
|
||||
'url': 'https://techcrunch.com/category/artificial-intelligence/feed/',
|
||||
'articles_limit': 30 # Increased from 20
|
||||
},
|
||||
'venturebeat': {
|
||||
'enabled': True,
|
||||
'url': 'https://venturebeat.com/category/ai/feed/',
|
||||
'articles_limit': 25 # Increased from 15
|
||||
'articles_limit': 30
|
||||
},
|
||||
'mit_tech_review': {
|
||||
'enabled': True,
|
||||
'url': 'https://www.technologyreview.com/feed/',
|
||||
'filter_ai': True,
|
||||
'articles_limit': 20 # Increased from 10
|
||||
},
|
||||
'theverge': {
|
||||
'enabled': True,
|
||||
'url': 'https://www.theverge.com/ai-artificial-intelligence/rss/index.xml',
|
||||
'articles_limit': 20
|
||||
},
|
||||
'wired_ai': {
|
||||
@@ -48,13 +32,100 @@ SOURCES = {
|
||||
'url': 'https://www.wired.com/feed/tag/ai/latest/rss',
|
||||
'articles_limit': 15
|
||||
},
|
||||
'arstechnica': {
|
||||
|
||||
# NEW HIGH-QUALITY SOURCES (Priority Tier 1)
|
||||
'openai_blog': {
|
||||
'enabled': True,
|
||||
'url': 'https://openai.com/blog/rss/',
|
||||
'articles_limit': 10
|
||||
},
|
||||
'huggingface': {
|
||||
'enabled': True,
|
||||
'url': 'https://huggingface.co/blog/feed.xml',
|
||||
'articles_limit': 15
|
||||
},
|
||||
'google_ai': {
|
||||
'enabled': True,
|
||||
'url': 'http://googleaiblog.blogspot.com/atom.xml',
|
||||
'articles_limit': 15
|
||||
},
|
||||
'marktechpost': {
|
||||
'enabled': True,
|
||||
'url': 'https://www.marktechpost.com/feed/',
|
||||
'articles_limit': 25
|
||||
},
|
||||
'the_rundown_ai': {
|
||||
'enabled': True,
|
||||
'url': 'https://rss.beehiiv.com/feeds/2R3C6Bt5wj.xml',
|
||||
'articles_limit': 10
|
||||
},
|
||||
'last_week_ai': {
|
||||
'enabled': True,
|
||||
'url': 'https://lastweekin.ai/feed',
|
||||
'articles_limit': 10
|
||||
},
|
||||
'ai_news': {
|
||||
'enabled': True,
|
||||
'url': 'https://www.artificialintelligence-news.com/feed/rss/',
|
||||
'articles_limit': 20
|
||||
},
|
||||
|
||||
# NEW SOURCES (Priority Tier 2)
|
||||
'kdnuggets': {
|
||||
'enabled': True,
|
||||
'url': 'https://www.kdnuggets.com/feed',
|
||||
'articles_limit': 20
|
||||
},
|
||||
'the_decoder': {
|
||||
'enabled': True,
|
||||
'url': 'https://the-decoder.com/feed/',
|
||||
'articles_limit': 20
|
||||
},
|
||||
'ai_business': {
|
||||
'enabled': True,
|
||||
'url': 'https://aibusiness.com/rss.xml',
|
||||
'articles_limit': 15
|
||||
},
|
||||
'unite_ai': {
|
||||
'enabled': True,
|
||||
'url': 'https://www.unite.ai/feed/',
|
||||
'articles_limit': 15
|
||||
},
|
||||
'simonwillison': {
|
||||
'enabled': True,
|
||||
'url': 'https://simonwillison.net/atom/everything/',
|
||||
'articles_limit': 10
|
||||
},
|
||||
'latent_space': {
|
||||
'enabled': True,
|
||||
'url': 'https://www.latent.space/feed',
|
||||
'articles_limit': 10
|
||||
},
|
||||
|
||||
# BROKEN SOURCES (disabled temporarily)
|
||||
'medium': {
|
||||
'enabled': False, # Scraping broken
|
||||
'tags': ['artificial-intelligence', 'machine-learning', 'chatgpt'],
|
||||
'url_pattern': 'https://medium.com/tag/{tag}/latest',
|
||||
'articles_per_tag': 15
|
||||
},
|
||||
'venturebeat': {
|
||||
'enabled': False, # RSS feed empty
|
||||
'url': 'https://venturebeat.com/category/ai/feed/',
|
||||
'articles_limit': 25
|
||||
},
|
||||
'theverge': {
|
||||
'enabled': False, # RSS feed empty
|
||||
'url': 'https://www.theverge.com/ai-artificial-intelligence/rss/index.xml',
|
||||
'articles_limit': 20
|
||||
},
|
||||
'arstechnica': {
|
||||
'enabled': False, # Needs testing
|
||||
'url': 'https://arstechnica.com/tag/artificial-intelligence/feed/',
|
||||
'articles_limit': 15
|
||||
},
|
||||
'hackernews': {
|
||||
'enabled': True,
|
||||
'enabled': False, # Needs testing
|
||||
'url': 'https://hnrss.org/newest?q=AI+OR+ChatGPT+OR+OpenAI',
|
||||
'articles_limit': 30
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user