# burmddit/backend/scraper.py
# (272 lines, 10 KiB, Python)
# Web scraper for AI news sources
import re
import time
from datetime import datetime, timedelta
from typing import List, Dict, Optional

import feedparser
import requests
from bs4 import BeautifulSoup
from loguru import logger
from newspaper import Article

import config
import database
class AINewsScraper:
    """Scrapes AI news articles from Medium tag pages and RSS feeds.

    Sources come from ``config.SOURCES``; extracted articles are persisted
    through ``database.insert_raw_article`` by ``scrape_all_sources``.
    """

    # Keywords that flag a text as AI-related (see is_ai_related()).
    AI_KEYWORDS = [
        'artificial intelligence', 'ai', 'machine learning', 'ml',
        'deep learning', 'neural network', 'chatgpt', 'gpt', 'llm',
        'claude', 'openai', 'anthropic', 'transformer', 'nlp',
        'generative ai', 'automation', 'computer vision',
    ]
    # Short tokens must match as whole words only; plain substring matching
    # made 'ai' match "rain"/"email" and 'ml' match "html".
    _WHOLE_WORD_KEYWORDS = frozenset({'ai', 'ml', 'gpt', 'llm', 'nlp'})

    def __init__(self):
        # Shared HTTP session with an identifiable bot User-Agent.
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (compatible; BurmdditBot/1.0; +https://burmddit.vercel.app)'
        })

    def scrape_all_sources(self) -> int:
        """Scrape every enabled source and store the results.

        Returns:
            Number of articles successfully inserted into the database
            (insert_raw_article presumably returns a falsy id for
            duplicates/failures — verify against database module).
        """
        total_articles = 0
        for source_name, source_config in config.SOURCES.items():
            if not source_config.get('enabled', True):
                continue
            logger.info(f"Scraping {source_name}...")
            try:
                if source_name == 'medium':
                    articles = self.scrape_medium(source_config)
                elif source_name in ('techcrunch', 'venturebeat', 'mit_tech_review'):
                    articles = self.scrape_rss_feed(source_config)
                else:
                    logger.warning(f"Unknown source: {source_name}")
                    continue
                # Store articles in database; count only actual inserts.
                for article in articles:
                    article_id = database.insert_raw_article(
                        url=article['url'],
                        title=article['title'],
                        content=article['content'],
                        author=article['author'],
                        published_date=article['published_date'],
                        source=source_name,
                        category_hint=article.get('category_hint')
                    )
                    if article_id:
                        total_articles += 1
                logger.info(f"Scraped {len(articles)} articles from {source_name}")
                time.sleep(config.RATE_LIMITS['delay_between_requests'])
            except Exception as e:
                # One broken source must not abort the whole run.
                logger.error(f"Error scraping {source_name}: {e}")
                continue
        logger.info(f"Total articles scraped: {total_articles}")
        return total_articles

    def scrape_medium(self, source_config: Dict) -> List[Dict]:
        """Scrape Medium articles for each tag in source_config['tags']."""
        articles = []
        for tag in source_config['tags']:
            try:
                url = source_config['url_pattern'].format(tag=tag)
                response = self.session.get(url, timeout=30)
                # Surface HTTP errors instead of parsing an error page;
                # the per-tag except below logs and moves on.
                response.raise_for_status()
                soup = BeautifulSoup(response.content, 'html.parser')
                # Medium's structure: find article cards.
                article_elements = soup.find_all('article', limit=source_config['articles_per_tag'])
                for element in article_elements:
                    try:
                        link = element.find('a', href=True)
                        if not link:
                            continue
                        article_url = link['href']
                        # Medium often emits relative hrefs.
                        if not article_url.startswith('http'):
                            article_url = 'https://medium.com' + article_url
                        # Use newspaper3k for full article extraction.
                        article = self.extract_article_content(article_url)
                        if article:
                            article['category_hint'] = self.detect_category_from_text(
                                article['title'] + ' ' + article['content'][:500]
                            )
                            articles.append(article)
                    except Exception as e:
                        logger.error(f"Error parsing Medium article: {e}")
                        continue
                time.sleep(2)  # Rate limiting between tag pages
            except Exception as e:
                logger.error(f"Error scraping Medium tag '{tag}': {e}")
                continue
        return articles

    def scrape_rss_feed(self, source_config: Dict) -> List[Dict]:
        """Scrape articles from an RSS feed, optionally filtered to AI topics."""
        articles = []
        try:
            feed = feedparser.parse(source_config['url'])
            for entry in feed.entries[:source_config.get('articles_limit', 20)]:
                try:
                    # Skip non-AI entries when the source enables filtering.
                    if source_config.get('filter_ai') and not self.is_ai_related(
                            entry.title + ' ' + entry.get('summary', '')):
                        continue
                    article = self.extract_article_content(entry.link)
                    if article:
                        article['category_hint'] = self.detect_category_from_text(
                            article['title'] + ' ' + article['content'][:500]
                        )
                        articles.append(article)
                except Exception as e:
                    logger.error(f"Error parsing RSS entry: {e}")
                    continue
        except Exception as e:
            logger.error(f"Error fetching RSS feed: {e}")
        return articles

    def extract_article_content(self, url: str) -> Optional[Dict]:
        """Download and parse one article with newspaper3k.

        Returns:
            Dict with url/title/content/author/published_date/top_image/
            images/videos, or None when the article is too short, older
            than 2 days, or fails to download/parse.
        """
        try:
            article = Article(url)
            article.download()
            article.parse()
            # Skip near-empty pages (paywalls, teasers, parse failures).
            if len(article.text) < 500:
                logger.debug(f"Article too short, skipping: {url}")
                return None
            pub_date = article.publish_date
            if not pub_date:
                pub_date = datetime.now()
            elif pub_date.tzinfo is not None:
                # newspaper3k may return a timezone-aware datetime; normalize
                # to naive local time so subtracting it from datetime.now()
                # cannot raise "can't subtract offset-naive and offset-aware".
                pub_date = pub_date.astimezone().replace(tzinfo=None)
            # Skip old articles (older than 2 days).
            if datetime.now() - pub_date > timedelta(days=2):
                logger.debug(f"Article too old, skipping: {url}")
                return None
            # Collect images, lead image first. article.images is a set, so
            # it must be materialized before slicing (sorted for determinism);
            # slicing the set directly raises TypeError.
            images = []
            if article.top_image:
                images.append(article.top_image)
            for img in sorted(article.images)[:config.PUBLISHING['max_images_per_article']]:
                if img and img not in images:
                    images.append(img)
            # Extract videos (YouTube, etc.).
            videos = list(article.movies) if article.movies else []
            # Best-effort scan of the raw HTML for embeds newspaper missed.
            try:
                soup = BeautifulSoup(article.html, 'html.parser')
                for iframe in soup.find_all('iframe'):
                    src = iframe.get('src', '')
                    if 'youtube.com' in src or 'youtu.be' in src:
                        videos.append(src)
                for img in soup.find_all('img')[:10]:
                    img_src = img.get('src', '')
                    if not img_src or img_src in images:
                        continue
                    if len(images) >= config.PUBLISHING['max_images_per_article']:
                        break
                    # Keep images whose width is unknown/non-numeric or
                    # > 200px; smaller ones are likely icons or ad pixels.
                    try:
                        known_width = int(img.get('width'))
                    except (TypeError, ValueError):
                        known_width = None
                    if known_width is None or known_width > 200:
                        images.append(img_src)
            except Exception as e:
                logger.debug(f"Error extracting additional media: {e}")
            return {
                'url': url,
                'title': article.title or 'Untitled',
                'content': article.text,
                'author': ', '.join(article.authors) if article.authors else 'Unknown',
                'published_date': pub_date,
                'top_image': article.top_image,
                'images': images,
                'videos': videos,
            }
        except Exception as e:
            logger.error(f"Error extracting article from {url}: {e}")
            return None

    def is_ai_related(self, text: str) -> bool:
        """Return True when *text* mentions an AI-related keyword.

        Multi-word keywords match as substrings; short acronyms (see
        _WHOLE_WORD_KEYWORDS) must match as whole words so that e.g.
        'rain' does not count as containing 'ai'.
        """
        text_lower = text.lower()
        for keyword in self.AI_KEYWORDS:
            if keyword in self._WHOLE_WORD_KEYWORDS:
                if re.search(rf'\b{keyword}\b', text_lower):
                    return True
            elif keyword in text_lower:
                return True
        return False

    def detect_category_from_text(self, text: str) -> Optional[str]:
        """Return the config category whose keywords best match *text*, or None."""
        text_lower = text.lower()
        scores = {
            category: sum(1 for keyword in keywords if keyword in text_lower)
            for category, keywords in config.CATEGORY_KEYWORDS.items()
        }
        # Guard against an empty keyword map (max() on an empty sequence
        # raises ValueError) and against all-zero scores.
        if scores and max(scores.values()) > 0:
            return max(scores, key=scores.get)
        return None
def run_scraper():
    """Run the crawl stage: scrape all sources and log the pipeline outcome.

    Returns:
        The number of articles scraped, or 0 when the stage fails.
    """
    logger.info("Starting scraper...")
    started = time.time()
    try:
        count = AINewsScraper().scrape_all_sources()
        elapsed = int(time.time() - started)
        # Record a successful crawl stage with its metrics.
        database.log_pipeline_stage(
            stage='crawl',
            status='completed',
            articles_processed=count,
            duration=elapsed
        )
        logger.info(f"Scraper completed in {elapsed}s. Articles scraped: {count}")
        return count
    except Exception as e:
        # Top-level boundary: log the failure and record it in the pipeline.
        logger.error(f"Scraper failed: {e}")
        database.log_pipeline_stage(
            stage='crawl',
            status='failed',
            error_message=str(e)
        )
        return 0
if __name__ == '__main__':
    # `logger` is already imported at module top; the previous redundant
    # re-import of loguru here has been removed.
    logger.add(config.LOG_FILE, rotation="1 day")  # daily log rotation for CLI runs
    run_scraper()