# Web scraper for AI news sources
import re
import time
from datetime import datetime, timedelta
from typing import Dict, List, Optional

import feedparser
import requests
from bs4 import BeautifulSoup
from loguru import logger
from newspaper import Article

import config
import database


class AINewsScraper:
    """Scrape AI-related articles from Medium tag pages and RSS feeds.

    Sources, rate limits and category keywords come from ``config``;
    scraped articles are persisted via ``database.insert_raw_article``.
    """

    # Compiled once at class level. Short tokens such as 'ai', 'ml' or
    # 'nlp' must match whole words only — plain substring tests matched
    # 'air', 'said', 'html', etc. and produced false positives.
    _AI_PATTERN = re.compile(
        r'\b(?:' + '|'.join(re.escape(k) for k in (
            'artificial intelligence', 'ai', 'machine learning', 'ml',
            'deep learning', 'neural network', 'chatgpt', 'gpt', 'llm',
            'claude', 'openai', 'anthropic', 'transformer', 'nlp',
            'generative ai', 'automation', 'computer vision',
        )) + r')\b'
    )

    def __init__(self):
        # One shared session gives us connection pooling; the UA string
        # identifies the bot politely and links back to the operator.
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (compatible; BurmdditBot/1.0; +https://burmddit.vercel.app)'
        })

    def scrape_all_sources(self) -> int:
        """Scrape all enabled sources and store the results.

        Returns:
            Number of articles newly inserted into the database
            (``insert_raw_article`` returning a falsy id — e.g. for a
            duplicate URL — is not counted).
        """
        total_articles = 0

        for source_name, source_config in config.SOURCES.items():
            if not source_config.get('enabled', True):
                continue

            logger.info(f"Scraping {source_name}...")

            try:
                if source_name == 'medium':
                    articles = self.scrape_medium(source_config)
                elif source_name in ('techcrunch', 'venturebeat', 'mit_tech_review'):
                    articles = self.scrape_rss_feed(source_config)
                else:
                    logger.warning(f"Unknown source: {source_name}")
                    continue

                # Store articles in database
                for article in articles:
                    article_id = database.insert_raw_article(
                        url=article['url'],
                        title=article['title'],
                        content=article['content'],
                        author=article['author'],
                        published_date=article['published_date'],
                        source=source_name,
                        category_hint=article.get('category_hint')
                    )
                    if article_id:
                        total_articles += 1

                logger.info(f"Scraped {len(articles)} articles from {source_name}")
                time.sleep(config.RATE_LIMITS['delay_between_requests'])

            except Exception as e:
                # One failing source must not abort the whole run.
                logger.error(f"Error scraping {source_name}: {e}")
                continue

        logger.info(f"Total articles scraped: {total_articles}")
        return total_articles

    def scrape_medium(self, source_config: Dict) -> List[Dict]:
        """Scrape Medium articles for each configured tag.

        Args:
            source_config: dict with 'tags', 'url_pattern' (must contain
                a ``{tag}`` placeholder) and 'articles_per_tag'.

        Returns:
            List of article dicts as produced by
            :meth:`extract_article_content`, each with a
            'category_hint' added.
        """
        articles = []

        for tag in source_config['tags']:
            try:
                url = source_config['url_pattern'].format(tag=tag)
                response = self.session.get(url, timeout=30)
                # Fail fast on HTTP errors instead of parsing an error page.
                response.raise_for_status()
                soup = BeautifulSoup(response.content, 'html.parser')

                # Medium's structure: each search result is an <article> card.
                article_elements = soup.find_all(
                    'article', limit=source_config['articles_per_tag'])

                for element in article_elements:
                    try:
                        # Extract article URL from the card's first link.
                        link = element.find('a', href=True)
                        if not link:
                            continue

                        article_url = link['href']
                        if not article_url.startswith('http'):
                            article_url = 'https://medium.com' + article_url

                        # Use newspaper3k for full article extraction
                        article = self.extract_article_content(article_url)
                        if article:
                            article['category_hint'] = self.detect_category_from_text(
                                article['title'] + ' ' + article['content'][:500]
                            )
                            articles.append(article)

                    except Exception as e:
                        logger.error(f"Error parsing Medium article: {e}")
                        continue

                time.sleep(2)  # Rate limiting between tag pages

            except Exception as e:
                logger.error(f"Error scraping Medium tag '{tag}': {e}")
                continue

        return articles

    def scrape_rss_feed(self, source_config: Dict) -> List[Dict]:
        """Scrape articles from an RSS feed.

        Args:
            source_config: dict with 'url', optional 'articles_limit'
                (default 20) and optional 'filter_ai' flag.

        Returns:
            List of extracted article dicts with 'category_hint' added.
        """
        articles = []

        try:
            feed = feedparser.parse(source_config['url'])

            for entry in feed.entries[:source_config.get('articles_limit', 20)]:
                try:
                    # Check if AI-related (if filter enabled)
                    if source_config.get('filter_ai') and not self.is_ai_related(
                            entry.title + ' ' + entry.get('summary', '')):
                        continue

                    article = self.extract_article_content(entry.link)
                    if article:
                        article['category_hint'] = self.detect_category_from_text(
                            article['title'] + ' ' + article['content'][:500]
                        )
                        articles.append(article)

                except Exception as e:
                    logger.error(f"Error parsing RSS entry: {e}")
                    continue

        except Exception as e:
            logger.error(f"Error fetching RSS feed: {e}")

        return articles

    def extract_article_content(self, url: str) -> Optional[Dict]:
        """Extract full article content using newspaper3k.

        Returns:
            Dict with url/title/content/author/published_date/top_image/
            images/videos, or None when the article is too short, too old
            or extraction fails.
        """
        try:
            article = Article(url)
            article.download()
            article.parse()

            # Skip if article is too short
            if len(article.text) < 500:
                logger.debug(f"Article too short, skipping: {url}")
                return None

            # Parse publication date; fall back to "now" when missing.
            pub_date = article.publish_date or datetime.now()
            # newspaper may return a timezone-aware datetime; normalize to
            # naive local time, otherwise the subtraction below raises
            # TypeError (naive - aware) and the article is silently dropped.
            if pub_date.tzinfo is not None:
                pub_date = pub_date.astimezone().replace(tzinfo=None)

            # Skip old articles (older than 2 days)
            if datetime.now() - pub_date > timedelta(days=2):
                logger.debug(f"Article too old, skipping: {url}")
                return None

            # Extract images
            images = []
            if article.top_image:
                images.append(article.top_image)

            # article.images may be a set (not sliceable) — materialize as
            # a list before slicing.
            max_images = config.PUBLISHING['max_images_per_article']
            for img in list(article.images)[:max_images]:
                if img and img not in images:
                    images.append(img)

            # Extract videos (YouTube, etc.)
            videos = list(article.movies) if article.movies else []

            # Also check the raw HTML for YouTube embeds and extra images.
            try:
                soup = BeautifulSoup(article.html, 'html.parser')

                # Find YouTube iframes
                for iframe in soup.find_all('iframe'):
                    src = iframe.get('src', '')
                    if ('youtube.com' in src or 'youtu.be' in src) and src not in videos:
                        videos.append(src)

                # Find more images
                for img in soup.find_all('img')[:10]:
                    img_src = img.get('src', '')
                    if img_src and img_src not in images and len(images) < max_images:
                        # Filter out tiny images (likely icons/ads): keep
                        # the image when the width is absent/unparseable or
                        # larger than 200px.
                        width = img.get('width', 0)
                        try:
                            keep = not width or int(width) > 200
                        except (TypeError, ValueError):
                            keep = True  # can't prove it's tiny — keep it
                        if keep:
                            images.append(img_src)

            except Exception as e:
                logger.debug(f"Error extracting additional media: {e}")

            return {
                'url': url,
                'title': article.title or 'Untitled',
                'content': article.text,
                'author': ', '.join(article.authors) if article.authors else 'Unknown',
                'published_date': pub_date,
                'top_image': article.top_image,
                'images': images,   # 🔥 Multiple images!
                'videos': videos    # 🔥 Video embeds!
            }

        except Exception as e:
            logger.error(f"Error extracting article from {url}: {e}")
            return None

    def is_ai_related(self, text: str) -> bool:
        """Return True when *text* mentions an AI-related keyword.

        Matching is case-insensitive and on word boundaries, so short
        keywords ('ai', 'ml', 'nlp') do not fire inside unrelated words.
        """
        return self._AI_PATTERN.search(text.lower()) is not None

    def detect_category_from_text(self, text: str) -> Optional[str]:
        """Return the configured category whose keywords best match *text*.

        Returns None when no keyword matches or no categories are
        configured (the original crashed on an empty keyword mapping).
        """
        text_lower = text.lower()
        scores = {
            category: sum(1 for keyword in keywords if keyword in text_lower)
            for category, keywords in config.CATEGORY_KEYWORDS.items()
        }
        if scores and max(scores.values()) > 0:
            return max(scores, key=scores.get)
        return None


def run_scraper():
    """Main scraper execution function.

    Runs a full scrape, logs the pipeline stage outcome to the database,
    and returns the number of articles scraped (0 on failure).
    """
    logger.info("Starting scraper...")
    start_time = time.time()

    try:
        scraper = AINewsScraper()
        articles_count = scraper.scrape_all_sources()
        duration = int(time.time() - start_time)

        database.log_pipeline_stage(
            stage='crawl',
            status='completed',
            articles_processed=articles_count,
            duration=duration
        )

        logger.info(f"Scraper completed in {duration}s. Articles scraped: {articles_count}")
        return articles_count

    except Exception as e:
        logger.error(f"Scraper failed: {e}")
        database.log_pipeline_stage(
            stage='crawl',
            status='failed',
            error_message=str(e)
        )
        return 0


if __name__ == '__main__':
    logger.add(config.LOG_FILE, rotation="1 day")
    run_scraper()