#!/usr/bin/env python3
"""
Test individual sources with the new scraper

Usage: python3 test_scraper.py [--source SOURCE_NAME] [--limit N]
"""
import sys
import argparse

from loguru import logger

import config
# Import the new scraper
from scraper_v2 import AINewsScraper


def test_source(source_name: str, limit: int = 5) -> bool:
    """Test a single source.

    Scrapes up to ``limit`` articles from the source named ``source_name``
    (looked up in ``config.SOURCES``), logs extraction stats and a sample
    article, and reports success.

    Returns True when the source is at least partially working (one or more
    articles were extracted), False otherwise.
    """
    if source_name not in config.SOURCES:
        logger.error(f"❌ Unknown source: {source_name}")
        logger.info(f"Available sources: {', '.join(config.SOURCES.keys())}")
        return False

    source_config = config.SOURCES[source_name]
    logger.info(f"🧪 Testing source: {source_name}")
    logger.info(f"   Config: {source_config}")
    logger.info(f"   Limit: {limit} articles")
    logger.info("")

    scraper = AINewsScraper()
    articles = []

    try:
        if source_name == 'medium':
            # Medium is tag-based: test only the first configured tag to keep
            # the run short, capped at `limit` articles.
            test_config = source_config.copy()
            test_config['tags'] = [source_config['tags'][0]]
            test_config['articles_per_tag'] = limit
            articles = scraper.scrape_medium(test_config)
        elif 'url' in source_config:
            # Any source with a 'url' key is treated as an RSS feed.
            test_config = source_config.copy()
            test_config['articles_limit'] = limit
            articles = scraper.scrape_rss_feed(source_name, test_config)
        else:
            logger.error(f"❌ Unknown source type")
            return False

        # Print results
        logger.info(f"\n✅ Test completed!")
        logger.info(f"   Articles extracted: {len(articles)}")
        logger.info(f"\n📊 Extraction stats:")
        logger.info(f"   newspaper3k: {scraper.stats['method_success']['newspaper']}")
        logger.info(f"   trafilatura: {scraper.stats['method_success']['trafilatura']}")
        logger.info(f"   readability: {scraper.stats['method_success']['readability']}")
        logger.info(f"   failed: {scraper.stats['method_success']['failed']}")

        if articles:
            logger.info(f"\n📰 Sample article:")
            sample = articles[0]
            logger.info(f"   Title: {sample['title'][:80]}...")
            logger.info(f"   Author: {sample['author']}")
            logger.info(f"   URL: {sample['url']}")
            logger.info(f"   Content length: {len(sample['content'])} chars")
            logger.info(f"   Images: {len(sample.get('images', []))}")
            logger.info(f"   Date: {sample['published_date']}")
            # Show first 200 chars of content
            logger.info(f"\n   Content preview:")
            logger.info(f"   {sample['content'][:200]}...")

        # Guarded: total_attempts may be 0 if nothing was even attempted.
        success_rate = (
            len(articles) / scraper.stats['total_attempts']
            if scraper.stats['total_attempts'] > 0 else 0
        )

        logger.info(f"\n{'='*60}")
        if len(articles) >= limit * 0.5:  # At least 50% success
            logger.info(f"✅ SUCCESS: {source_name} is working ({success_rate:.0%} success rate)")
            return True
        elif len(articles) > 0:
            logger.info(f"⚠️ PARTIAL: {source_name} is partially working ({success_rate:.0%} success rate)")
            return True
        else:
            logger.info(f"❌ FAILED: {source_name} is not working")
            return False

    except Exception as e:
        logger.error(f"❌ Test failed with error: {e}")
        import traceback
        traceback.print_exc()
        return False


def test_all_sources() -> dict:
    """Test all enabled sources.

    Iterates ``config.SOURCES``, skipping sources whose config has
    ``enabled`` set to a falsy value, and runs :func:`test_source` on the
    rest with a small limit. Logs a working/broken summary.

    Returns a dict mapping source name -> bool (whether the source worked).
    """
    logger.info("🧪 Testing all enabled sources...\n")

    results = {}
    for source_name, source_config in config.SOURCES.items():
        if not source_config.get('enabled', True):
            logger.info(f"⏭️ Skipping {source_name} (disabled)\n")
            continue
        success = test_source(source_name, limit=3)
        results[source_name] = success
        logger.info("")

    # Summary
    logger.info(f"\n{'='*60}")
    logger.info(f"📊 TEST SUMMARY")
    logger.info(f"{'='*60}")

    working = [k for k, v in results.items() if v]
    broken = [k for k, v in results.items() if not v]

    logger.info(f"\n✅ Working sources ({len(working)}):")
    for source in working:
        logger.info(f"   • {source}")

    if broken:
        logger.info(f"\n❌ Broken sources ({len(broken)}):")
        for source in broken:
            logger.info(f"   • {source}")

    # BUGFIX: avoid ZeroDivisionError when every source is disabled
    # (results empty); report 0% in that case.
    total = len(results)
    percent = (100 * len(working) // total) if total else 0
    logger.info(f"\n📈 Overall: {len(working)}/{total} sources working ({percent}%)")

    return results


def main():
    """CLI entry point: parse args and dispatch to the test runners.

    Exits with status 1 when a single-source test fails so the script can be
    used in CI / shell pipelines.
    """
    parser = argparse.ArgumentParser(description='Test burmddit scraper sources')
    parser.add_argument('--source', type=str, help='Test specific source')
    parser.add_argument('--limit', type=int, default=5,
                        help='Number of articles to test (default: 5)')
    parser.add_argument('--all', action='store_true', help='Test all sources')
    args = parser.parse_args()

    # Configure logger: plain messages to stdout, no timestamps/levels.
    logger.remove()
    logger.add(sys.stdout, format="{message}", level="INFO")

    if args.all:
        test_all_sources()
    elif args.source:
        success = test_source(args.source, args.limit)
        sys.exit(0 if success else 1)
    else:
        # No mode chosen: show help plus the list of known sources.
        parser.print_help()
        logger.info("\nAvailable sources:")
        for source_name in config.SOURCES.keys():
            enabled = "✅" if config.SOURCES[source_name].get('enabled', True) else "❌"
            logger.info(f"   {enabled} {source_name}")


if __name__ == '__main__':
    main()