forked from minzeyaphyo/burmddit
Frontend changes:
- Add /admin dashboard for article management
- Add AdminButton component (Alt+Shift+A on articles)
- Add /api/admin/article API endpoints

Backend improvements:
- scraper_v2.py: Multi-layer fallback extraction (newspaper → trafilatura → readability)
- translator_v2.py: Better chunking, repetition detection, validation
- admin_tools.py: CLI admin commands
- test_scraper.py: Individual source testing

Docs:
- WEB-ADMIN-GUIDE.md: Web admin usage
- ADMIN-GUIDE.md: CLI admin usage
- SCRAPER-IMPROVEMENT-PLAN.md: Scraper fixes details
- TRANSLATION-FIX.md: Translation improvements
- ADMIN-FEATURES-SUMMARY.md: Implementation summary

Fixes:
- Article scraping from 0 → 96+ articles working
- Translation quality issues (repetition, truncation)
- Added 13 new RSS sources

153 lines · 5.5 KiB · Python · Executable File
#!/usr/bin/env python3
|
|
"""
|
|
Test individual sources with the new scraper
|
|
Usage: python3 test_scraper.py [--source SOURCE_NAME] [--limit N]
|
|
"""
|
|
|
|
import sys
|
|
import argparse
|
|
from loguru import logger
|
|
import config
|
|
|
|
# Import the new scraper
|
|
from scraper_v2 import AINewsScraper
|
|
|
|
def test_source(source_name: str, limit: int = 5) -> bool:
    """Test scraping a single configured source end-to-end.

    Looks the source up in ``config.SOURCES``, runs the appropriate
    scraper path (Medium tag scraping vs. generic RSS feed), then logs
    per-method extraction stats and a sample article.

    Args:
        source_name: Key into ``config.SOURCES`` identifying the source.
        limit: Maximum number of articles to attempt (default 5).

    Returns:
        True when at least one article was extracted (full or partial
        success), False on unknown source, unknown source type, zero
        articles, or an unexpected exception.
    """
    if source_name not in config.SOURCES:
        logger.error(f"❌ Unknown source: {source_name}")
        logger.info(f"Available sources: {', '.join(config.SOURCES.keys())}")
        return False

    source_config = config.SOURCES[source_name]

    logger.info(f"🧪 Testing source: {source_name}")
    logger.info(f" Config: {source_config}")
    logger.info(f" Limit: {limit} articles")
    logger.info("")

    scraper = AINewsScraper()
    articles = []

    try:
        if source_name == 'medium':
            # Medium is tag-based: restrict the test run to the first
            # configured tag so the test stays fast.
            test_config = source_config.copy()
            test_config['tags'] = [source_config['tags'][0]]
            test_config['articles_per_tag'] = limit
            articles = scraper.scrape_medium(test_config)
        elif 'url' in source_config:
            # Generic RSS-feed source.
            test_config = source_config.copy()
            test_config['articles_limit'] = limit
            articles = scraper.scrape_rss_feed(source_name, test_config)
        else:
            # Fix: the original f-string had no placeholder, so the
            # message never said WHICH source/config was unrecognized.
            logger.error(f"❌ Unknown source type for {source_name}: {source_config}")
            return False

        # Print results
        logger.info(f"\n✅ Test completed!")
        logger.info(f" Articles extracted: {len(articles)}")
        logger.info(f"\n📊 Extraction stats:")
        logger.info(f" newspaper3k: {scraper.stats['method_success']['newspaper']}")
        logger.info(f" trafilatura: {scraper.stats['method_success']['trafilatura']}")
        logger.info(f" readability: {scraper.stats['method_success']['readability']}")
        logger.info(f" failed: {scraper.stats['method_success']['failed']}")

        if articles:
            # Show one sample so the operator can eyeball extraction quality.
            logger.info(f"\n📰 Sample article:")
            sample = articles[0]
            logger.info(f" Title: {sample['title'][:80]}...")
            logger.info(f" Author: {sample['author']}")
            logger.info(f" URL: {sample['url']}")
            logger.info(f" Content length: {len(sample['content'])} chars")
            logger.info(f" Images: {len(sample.get('images', []))}")
            logger.info(f" Date: {sample['published_date']}")

            # Show first 200 chars of content
            logger.info(f"\n Content preview:")
            logger.info(f" {sample['content'][:200]}...")

        # Guard against division by zero when nothing was attempted.
        success_rate = len(articles) / scraper.stats['total_attempts'] if scraper.stats['total_attempts'] > 0 else 0

        logger.info(f"\n{'='*60}")
        if len(articles) >= limit * 0.5:  # At least 50% success
            logger.info(f"✅ SUCCESS: {source_name} is working ({success_rate:.0%} success rate)")
            return True
        elif len(articles) > 0:
            logger.info(f"⚠️ PARTIAL: {source_name} is partially working ({success_rate:.0%} success rate)")
            return True
        else:
            logger.info(f"❌ FAILED: {source_name} is not working")
            return False

    except Exception as e:
        # Fix: loguru's logger.exception logs the traceback through the
        # configured sink; the original printed it with traceback.print_exc,
        # bypassing the logger entirely.
        logger.exception(f"❌ Test failed with error: {e}")
        return False
|
|
|
|
def test_all_sources():
    """Test every enabled source and log a pass/fail summary.

    Sources with ``enabled: False`` in their config are skipped and do
    not appear in the result.

    Returns:
        Dict mapping source name -> bool (True when ``test_source``
        reported full or partial success).
    """
    logger.info("🧪 Testing all enabled sources...\n")

    results = {}

    for source_name, source_config in config.SOURCES.items():
        if not source_config.get('enabled', True):
            logger.info(f"⏭️ Skipping {source_name} (disabled)\n")
            continue

        # Small limit keeps the full sweep fast.
        success = test_source(source_name, limit=3)
        results[source_name] = success
        logger.info("")

    # Summary
    logger.info(f"\n{'='*60}")
    logger.info("📊 TEST SUMMARY")
    logger.info(f"{'='*60}")

    working = [k for k, v in results.items() if v]
    broken = [k for k, v in results.items() if not v]

    logger.info(f"\n✅ Working sources ({len(working)}):")
    for source in working:
        logger.info(f" • {source}")

    if broken:
        logger.info(f"\n❌ Broken sources ({len(broken)}):")
        for source in broken:
            logger.info(f" • {source}")

    # Fix: the original computed 100*len(working)//len(results) unguarded,
    # raising ZeroDivisionError when every source is disabled.
    if results:
        pct = 100 * len(working) // len(results)
        logger.info(f"\n📈 Overall: {len(working)}/{len(results)} sources working ({pct}%)")
    else:
        logger.info("\n📈 Overall: no enabled sources to test")

    return results
|
|
|
|
def main():
    """CLI entry point: parse arguments and dispatch to the test runners."""
    parser = argparse.ArgumentParser(description='Test burmddit scraper sources')
    parser.add_argument('--source', type=str, help='Test specific source')
    parser.add_argument('--limit', type=int, default=5,
                        help='Number of articles to test (default: 5)')
    parser.add_argument('--all', action='store_true', help='Test all sources')

    args = parser.parse_args()

    # Replace loguru's default sink with a plain, message-only stdout sink.
    logger.remove()
    logger.add(sys.stdout, format="<level>{message}</level>", level="INFO")

    if args.all:
        test_all_sources()
        return

    if args.source:
        ok = test_source(args.source, args.limit)
        sys.exit(0 if ok else 1)

    # No mode selected: show usage plus the configured source list.
    parser.print_help()
    logger.info("\nAvailable sources:")
    for name, cfg in config.SOURCES.items():
        marker = "✅" if cfg.get('enabled', True) else "❌"
        logger.info(f" {marker} {name}")


if __name__ == '__main__':
    main()
|