forked from minzeyaphyo/burmddit
Frontend changes:
- Add /admin dashboard for article management
- Add AdminButton component (Alt+Shift+A on articles)
- Add /api/admin/article API endpoints

Backend improvements:
- scraper_v2.py: Multi-layer fallback extraction (newspaper → trafilatura → readability)
- translator_v2.py: Better chunking, repetition detection, validation
- admin_tools.py: CLI admin commands
- test_scraper.py: Individual source testing

Docs:
- WEB-ADMIN-GUIDE.md: Web admin usage
- ADMIN-GUIDE.md: CLI admin usage
- SCRAPER-IMPROVEMENT-PLAN.md: Scraper fixes details
- TRANSLATION-FIX.md: Translation improvements
- ADMIN-FEATURES-SUMMARY.md: Implementation summary

Fixes:
- Article scraping from 0 → 96+ articles working
- Translation quality issues (repetition, truncation)
- Added 13 new RSS sources

153 lines · 5.5 KiB · Python · Executable File
#!/usr/bin/env python3
|
|
"""
|
|
Test individual sources with the new scraper
|
|
Usage: python3 test_scraper.py [--source SOURCE_NAME] [--limit N]
|
|
"""
|
|
|
|
import sys
|
|
import argparse
|
|
from loguru import logger
|
|
import config
|
|
|
|
# Import the new scraper
|
|
from scraper_v2 import AINewsScraper
|
|
|
|
def test_source(source_name: str, limit: int = 5) -> bool:
    """Test scraping a single configured source end-to-end.

    Looks the source up in ``config.SOURCES``, runs the appropriate
    scraper path (Medium tag scraping vs. generic RSS feed), then logs
    per-method extraction stats and a sample article.

    Args:
        source_name: Key into ``config.SOURCES`` identifying the source.
        limit: Maximum number of articles to attempt (default 5).

    Returns:
        True when at least one article was extracted (full or partial
        success), False on unknown source, unknown source type, zero
        articles, or an unexpected exception.
    """
    if source_name not in config.SOURCES:
        logger.error(f"❌ Unknown source: {source_name}")
        logger.info(f"Available sources: {', '.join(config.SOURCES.keys())}")
        return False

    source_config = config.SOURCES[source_name]

    logger.info(f"🧪 Testing source: {source_name}")
    logger.info(f" Config: {source_config}")
    logger.info(f" Limit: {limit} articles")
    logger.info("")

    scraper = AINewsScraper()
    articles = []

    try:
        if source_name == 'medium':
            # Medium is tag-based: restrict the test run to the first
            # configured tag so the test stays fast.
            test_config = source_config.copy()
            test_config['tags'] = [source_config['tags'][0]]
            test_config['articles_per_tag'] = limit
            articles = scraper.scrape_medium(test_config)
        elif 'url' in source_config:
            # Generic RSS-feed source.
            test_config = source_config.copy()
            test_config['articles_limit'] = limit
            articles = scraper.scrape_rss_feed(source_name, test_config)
        else:
            # Fix: the original f-string had no placeholder, so the
            # message never said WHICH source/config was unrecognized.
            logger.error(f"❌ Unknown source type for {source_name}: {source_config}")
            return False

        # Print results
        logger.info(f"\n✅ Test completed!")
        logger.info(f" Articles extracted: {len(articles)}")
        logger.info(f"\n📊 Extraction stats:")
        logger.info(f" newspaper3k: {scraper.stats['method_success']['newspaper']}")
        logger.info(f" trafilatura: {scraper.stats['method_success']['trafilatura']}")
        logger.info(f" readability: {scraper.stats['method_success']['readability']}")
        logger.info(f" failed: {scraper.stats['method_success']['failed']}")

        if articles:
            # Show one sample so the operator can eyeball extraction quality.
            logger.info(f"\n📰 Sample article:")
            sample = articles[0]
            logger.info(f" Title: {sample['title'][:80]}...")
            logger.info(f" Author: {sample['author']}")
            logger.info(f" URL: {sample['url']}")
            logger.info(f" Content length: {len(sample['content'])} chars")
            logger.info(f" Images: {len(sample.get('images', []))}")
            logger.info(f" Date: {sample['published_date']}")

            # Show first 200 chars of content
            logger.info(f"\n Content preview:")
            logger.info(f" {sample['content'][:200]}...")

        # Guard against division by zero when nothing was attempted.
        success_rate = len(articles) / scraper.stats['total_attempts'] if scraper.stats['total_attempts'] > 0 else 0

        logger.info(f"\n{'='*60}")
        if len(articles) >= limit * 0.5:  # At least 50% success
            logger.info(f"✅ SUCCESS: {source_name} is working ({success_rate:.0%} success rate)")
            return True
        elif len(articles) > 0:
            logger.info(f"⚠️ PARTIAL: {source_name} is partially working ({success_rate:.0%} success rate)")
            return True
        else:
            logger.info(f"❌ FAILED: {source_name} is not working")
            return False

    except Exception as e:
        # Fix: loguru's logger.exception logs the traceback through the
        # configured sink; the original printed it with traceback.print_exc,
        # bypassing the logger entirely.
        logger.exception(f"❌ Test failed with error: {e}")
        return False
|
|
|
|
def test_all_sources():
    """Test every enabled source and log a pass/fail summary.

    Sources with ``enabled: False`` in their config are skipped and do
    not appear in the result.

    Returns:
        Dict mapping source name -> bool (True when ``test_source``
        reported full or partial success).
    """
    logger.info("🧪 Testing all enabled sources...\n")

    results = {}

    for source_name, source_config in config.SOURCES.items():
        if not source_config.get('enabled', True):
            logger.info(f"⏭️ Skipping {source_name} (disabled)\n")
            continue

        # Small limit keeps the full sweep fast.
        success = test_source(source_name, limit=3)
        results[source_name] = success
        logger.info("")

    # Summary
    logger.info(f"\n{'='*60}")
    logger.info("📊 TEST SUMMARY")
    logger.info(f"{'='*60}")

    working = [k for k, v in results.items() if v]
    broken = [k for k, v in results.items() if not v]

    logger.info(f"\n✅ Working sources ({len(working)}):")
    for source in working:
        logger.info(f" • {source}")

    if broken:
        logger.info(f"\n❌ Broken sources ({len(broken)}):")
        for source in broken:
            logger.info(f" • {source}")

    # Fix: the original computed 100*len(working)//len(results) unguarded,
    # raising ZeroDivisionError when every source is disabled.
    if results:
        pct = 100 * len(working) // len(results)
        logger.info(f"\n📈 Overall: {len(working)}/{len(results)} sources working ({pct}%)")
    else:
        logger.info("\n📈 Overall: no enabled sources to test")

    return results
|
|
|
|
def main():
    """CLI entry point: parse arguments and dispatch to the test runners."""
    parser = argparse.ArgumentParser(description='Test burmddit scraper sources')
    parser.add_argument('--source', type=str, help='Test specific source')
    parser.add_argument('--limit', type=int, default=5,
                        help='Number of articles to test (default: 5)')
    parser.add_argument('--all', action='store_true', help='Test all sources')

    args = parser.parse_args()

    # Replace loguru's default sink with a plain, message-only stdout sink.
    logger.remove()
    logger.add(sys.stdout, format="<level>{message}</level>", level="INFO")

    if args.all:
        test_all_sources()
        return

    if args.source:
        ok = test_source(args.source, args.limit)
        sys.exit(0 if ok else 1)

    # No mode selected: show usage plus the configured source list.
    parser.print_help()
    logger.info("\nAvailable sources:")
    for name, cfg in config.SOURCES.items():
        marker = "✅" if cfg.get('enabled', True) else "❌"
        logger.info(f" {marker} {name}")


if __name__ == '__main__':
    main()
|