Fix scraper: use newspaper4k, handle all RSS sources
This commit is contained in:
@@ -4,7 +4,7 @@
|
|||||||
beautifulsoup4==4.12.3
|
beautifulsoup4==4.12.3
|
||||||
requests==2.31.0
|
requests==2.31.0
|
||||||
feedparser==6.0.11
|
feedparser==6.0.11
|
||||||
newspaper3k==0.2.8
|
newspaper4k>=0.9.3
|
||||||
lxml_html_clean
|
lxml_html_clean
|
||||||
|
|
||||||
# Database
|
# Database
|
||||||
|
|||||||
@@ -31,7 +31,7 @@ class AINewsScraper:
|
|||||||
try:
|
try:
|
||||||
if source_name == 'medium':
|
if source_name == 'medium':
|
||||||
articles = self.scrape_medium(source_config)
|
articles = self.scrape_medium(source_config)
|
||||||
elif source_name in ['techcrunch', 'venturebeat', 'mit_tech_review']:
|
elif 'url' in source_config:
|
||||||
articles = self.scrape_rss_feed(source_config)
|
articles = self.scrape_rss_feed(source_config)
|
||||||
else:
|
else:
|
||||||
logger.warning(f"Unknown source: {source_name}")
|
logger.warning(f"Unknown source: {source_name}")
|
||||||
|
|||||||
Reference in New Issue
Block a user