Files
burmddit/backend/test_scraper.py
Zeya Phyo f51ac4afa4 Add web admin features + fix scraper & translator
Frontend changes:
- Add /admin dashboard for article management
- Add AdminButton component (Alt+Shift+A on articles)
- Add /api/admin/article API endpoints

Backend improvements:
- scraper_v2.py: Multi-layer fallback extraction (newspaper → trafilatura → readability)
- translator_v2.py: Better chunking, repetition detection, validation
- admin_tools.py: CLI admin commands
- test_scraper.py: Individual source testing

Docs:
- WEB-ADMIN-GUIDE.md: Web admin usage
- ADMIN-GUIDE.md: CLI admin usage
- SCRAPER-IMPROVEMENT-PLAN.md: Scraper fixes details
- TRANSLATION-FIX.md: Translation improvements
- ADMIN-FEATURES-SUMMARY.md: Implementation summary

Fixes:
- Article scraping from 0 → 96+ articles working
- Translation quality issues (repetition, truncation)
- Added 13 new RSS sources
2026-02-26 09:17:50 +00:00

153 lines
5.5 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Test individual sources with the new scraper
Usage: python3 test_scraper.py [--source SOURCE_NAME] [--limit N]
"""
import sys
import argparse
from loguru import logger
import config
# Import the new scraper
from scraper_v2 import AINewsScraper
def test_source(source_name: str, limit: int = 5) -> bool:
    """Run a live scrape of a single source and report extraction stats.

    Args:
        source_name: Key into ``config.SOURCES`` identifying the source.
        limit: Maximum number of articles to attempt (default 5).

    Returns:
        True if the source produced at least one article, False otherwise.
    """
    if source_name not in config.SOURCES:
        logger.error(f"❌ Unknown source: {source_name}")
        logger.info(f"Available sources: {', '.join(config.SOURCES.keys())}")
        return False

    source_config = config.SOURCES[source_name]
    logger.info(f"🧪 Testing source: {source_name}")
    logger.info(f" Config: {source_config}")
    logger.info(f" Limit: {limit} articles")
    logger.info("")

    scraper = AINewsScraper()
    articles = []
    try:
        if source_name == 'medium':
            # Medium is tag-driven: restrict to the first tag so the test stays short.
            test_config = source_config.copy()
            test_config['tags'] = [source_config['tags'][0]]
            test_config['articles_per_tag'] = limit
            articles = scraper.scrape_medium(test_config)
        elif 'url' in source_config:
            test_config = source_config.copy()
            test_config['articles_limit'] = limit
            articles = scraper.scrape_rss_feed(source_name, test_config)
        else:
            # FIX: was an f-string with no placeholders.
            logger.error("❌ Unknown source type")
            return False

        # Report per-extraction-method success counts from the scraper.
        logger.info("\n✅ Test completed!")
        logger.info(f" Articles extracted: {len(articles)}")
        logger.info("\n📊 Extraction stats:")
        logger.info(f" newspaper3k: {scraper.stats['method_success']['newspaper']}")
        logger.info(f" trafilatura: {scraper.stats['method_success']['trafilatura']}")
        logger.info(f" readability: {scraper.stats['method_success']['readability']}")
        logger.info(f" failed: {scraper.stats['method_success']['failed']}")

        if articles:
            logger.info("\n📰 Sample article:")
            sample = articles[0]
            # FIX: use .get() so a sample missing an optional field does not
            # crash the test run with a KeyError.
            logger.info(f" Title: {sample.get('title', '')[:80]}...")
            logger.info(f" Author: {sample.get('author')}")
            logger.info(f" URL: {sample.get('url')}")
            logger.info(f" Content length: {len(sample.get('content', ''))} chars")
            logger.info(f" Images: {len(sample.get('images', []))}")
            logger.info(f" Date: {sample.get('published_date')}")
            # Show the first 200 chars of extracted content as a sanity check.
            logger.info("\n Content preview:")
            logger.info(f" {sample.get('content', '')[:200]}...")

        # Ratio of extracted articles to attempted fetches; guarded against
        # a zero-attempt run (e.g. an empty feed).
        total = scraper.stats['total_attempts']
        success_rate = len(articles) / total if total > 0 else 0

        logger.info(f"\n{'='*60}")
        if len(articles) >= limit * 0.5:  # At least 50% success
            logger.info(f"✅ SUCCESS: {source_name} is working ({success_rate:.0%} success rate)")
            return True
        elif len(articles) > 0:
            logger.info(f"⚠️ PARTIAL: {source_name} is partially working ({success_rate:.0%} success rate)")
            return True
        else:
            logger.info(f"❌ FAILED: {source_name} is not working")
            return False
    except Exception as e:
        # Broad catch is deliberate: this is a diagnostic CLI and any failure
        # should be reported, not propagated.
        logger.error(f"❌ Test failed with error: {e}")
        import traceback
        traceback.print_exc()
        return False
def test_all_sources() -> dict:
    """Test every enabled source in config.SOURCES and print a summary.

    Returns:
        Dict mapping source name -> bool (True if the source produced articles).
    """
    logger.info("🧪 Testing all enabled sources...\n")
    results = {}
    for source_name, source_config in config.SOURCES.items():
        # Sources default to enabled when the flag is absent.
        if not source_config.get('enabled', True):
            logger.info(f"⏭️ Skipping {source_name} (disabled)\n")
            continue
        results[source_name] = test_source(source_name, limit=3)
        logger.info("")

    # Summary
    logger.info(f"\n{'='*60}")
    logger.info("📊 TEST SUMMARY")
    logger.info(f"{'='*60}")
    working = [k for k, v in results.items() if v]
    broken = [k for k, v in results.items() if not v]
    logger.info(f"\n✅ Working sources ({len(working)}):")
    for source in working:
        logger.info(f"{source}")
    if broken:
        logger.info(f"\n❌ Broken sources ({len(broken)}):")
        for source in broken:
            logger.info(f"{source}")

    # FIX: guard the percentage computation — if every source is disabled,
    # results is empty and the original integer division raised ZeroDivisionError.
    if results:
        pct = 100 * len(working) // len(results)
        logger.info(f"\n📈 Overall: {len(working)}/{len(results)} sources working ({pct}%)")
    else:
        logger.info("\n📈 Overall: no enabled sources to test")
    return results
def main():
    """CLI entry point: parse arguments and dispatch to the test routines.

    Exits with status 1 when a single-source test (--source) fails.
    """
    parser = argparse.ArgumentParser(description='Test burmddit scraper sources')
    parser.add_argument('--source', type=str, help='Test specific source')
    parser.add_argument('--limit', type=int, default=5, help='Number of articles to test (default: 5)')
    parser.add_argument('--all', action='store_true', help='Test all sources')
    args = parser.parse_args()

    # Replace loguru's default sink with a bare-message stdout sink so the
    # test output reads like a plain report.
    logger.remove()
    logger.add(sys.stdout, format="<level>{message}</level>", level="INFO")

    if args.all:
        test_all_sources()
    elif args.source:
        success = test_source(args.source, args.limit)
        sys.exit(0 if success else 1)
    else:
        parser.print_help()
        logger.info("\nAvailable sources:")
        for source_name, source_config in config.SOURCES.items():
            # FIX: both branches of the original conditional produced "" (the
            # enabled/disabled markers were identical empty strings); restore
            # distinct markers consistent with the rest of the script.
            marker = "✅" if source_config.get('enabled', True) else "❌"
            logger.info(f" {marker} {source_name}")


if __name__ == '__main__':
    main()