# burmddit/backend/scraper.py
# (272 lines, 10 KiB, Python)
# Web scraper for AI news sources
import re
import time
from datetime import datetime, timedelta
from typing import List, Dict, Optional

import feedparser
import requests
from bs4 import BeautifulSoup
from loguru import logger
from newspaper import Article

import config
import database
class AINewsScraper:
    """Scrapes AI news articles from Medium tag pages and RSS feeds.

    Sources come from ``config.SOURCES``; extracted articles are persisted
    through ``database.insert_raw_article`` by ``scrape_all_sources``.
    """

    # Keywords that flag a text as AI-related (see is_ai_related()).
    AI_KEYWORDS = [
        'artificial intelligence', 'ai', 'machine learning', 'ml',
        'deep learning', 'neural network', 'chatgpt', 'gpt', 'llm',
        'claude', 'openai', 'anthropic', 'transformer', 'nlp',
        'generative ai', 'automation', 'computer vision',
    ]
    # Short tokens must match as whole words only; plain substring matching
    # made 'ai' match "rain"/"email" and 'ml' match "html".
    _WHOLE_WORD_KEYWORDS = frozenset({'ai', 'ml', 'gpt', 'llm', 'nlp'})

    def __init__(self):
        # Shared HTTP session with an identifiable bot User-Agent.
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (compatible; BurmdditBot/1.0; +https://burmddit.vercel.app)'
        })

    def scrape_all_sources(self) -> int:
        """Scrape every enabled source and store the results.

        Returns:
            Number of articles successfully inserted into the database
            (insert_raw_article presumably returns a falsy id for
            duplicates/failures — verify against database module).
        """
        total_articles = 0
        for source_name, source_config in config.SOURCES.items():
            if not source_config.get('enabled', True):
                continue
            logger.info(f"Scraping {source_name}...")
            try:
                if source_name == 'medium':
                    articles = self.scrape_medium(source_config)
                elif source_name in ('techcrunch', 'venturebeat', 'mit_tech_review'):
                    articles = self.scrape_rss_feed(source_config)
                else:
                    logger.warning(f"Unknown source: {source_name}")
                    continue
                # Store articles in database; count only actual inserts.
                for article in articles:
                    article_id = database.insert_raw_article(
                        url=article['url'],
                        title=article['title'],
                        content=article['content'],
                        author=article['author'],
                        published_date=article['published_date'],
                        source=source_name,
                        category_hint=article.get('category_hint')
                    )
                    if article_id:
                        total_articles += 1
                logger.info(f"Scraped {len(articles)} articles from {source_name}")
                time.sleep(config.RATE_LIMITS['delay_between_requests'])
            except Exception as e:
                # One broken source must not abort the whole run.
                logger.error(f"Error scraping {source_name}: {e}")
                continue
        logger.info(f"Total articles scraped: {total_articles}")
        return total_articles

    def scrape_medium(self, source_config: Dict) -> List[Dict]:
        """Scrape Medium articles for each tag in source_config['tags']."""
        articles = []
        for tag in source_config['tags']:
            try:
                url = source_config['url_pattern'].format(tag=tag)
                response = self.session.get(url, timeout=30)
                # Surface HTTP errors instead of parsing an error page;
                # the per-tag except below logs and moves on.
                response.raise_for_status()
                soup = BeautifulSoup(response.content, 'html.parser')
                # Medium's structure: find article cards.
                article_elements = soup.find_all('article', limit=source_config['articles_per_tag'])
                for element in article_elements:
                    try:
                        link = element.find('a', href=True)
                        if not link:
                            continue
                        article_url = link['href']
                        # Medium often emits relative hrefs.
                        if not article_url.startswith('http'):
                            article_url = 'https://medium.com' + article_url
                        # Use newspaper3k for full article extraction.
                        article = self.extract_article_content(article_url)
                        if article:
                            article['category_hint'] = self.detect_category_from_text(
                                article['title'] + ' ' + article['content'][:500]
                            )
                            articles.append(article)
                    except Exception as e:
                        logger.error(f"Error parsing Medium article: {e}")
                        continue
                time.sleep(2)  # Rate limiting between tag pages
            except Exception as e:
                logger.error(f"Error scraping Medium tag '{tag}': {e}")
                continue
        return articles

    def scrape_rss_feed(self, source_config: Dict) -> List[Dict]:
        """Scrape articles from an RSS feed, optionally filtered to AI topics."""
        articles = []
        try:
            feed = feedparser.parse(source_config['url'])
            for entry in feed.entries[:source_config.get('articles_limit', 20)]:
                try:
                    # Skip non-AI entries when the source enables filtering.
                    if source_config.get('filter_ai') and not self.is_ai_related(
                            entry.title + ' ' + entry.get('summary', '')):
                        continue
                    article = self.extract_article_content(entry.link)
                    if article:
                        article['category_hint'] = self.detect_category_from_text(
                            article['title'] + ' ' + article['content'][:500]
                        )
                        articles.append(article)
                except Exception as e:
                    logger.error(f"Error parsing RSS entry: {e}")
                    continue
        except Exception as e:
            logger.error(f"Error fetching RSS feed: {e}")
        return articles

    def extract_article_content(self, url: str) -> Optional[Dict]:
        """Download and parse one article with newspaper3k.

        Returns:
            Dict with url/title/content/author/published_date/top_image/
            images/videos, or None when the article is too short, older
            than 2 days, or fails to download/parse.
        """
        try:
            article = Article(url)
            article.download()
            article.parse()
            # Skip near-empty pages (paywalls, teasers, parse failures).
            if len(article.text) < 500:
                logger.debug(f"Article too short, skipping: {url}")
                return None
            pub_date = article.publish_date
            if not pub_date:
                pub_date = datetime.now()
            elif pub_date.tzinfo is not None:
                # newspaper3k may return a timezone-aware datetime; normalize
                # to naive local time so subtracting it from datetime.now()
                # cannot raise "can't subtract offset-naive and offset-aware".
                pub_date = pub_date.astimezone().replace(tzinfo=None)
            # Skip old articles (older than 2 days).
            if datetime.now() - pub_date > timedelta(days=2):
                logger.debug(f"Article too old, skipping: {url}")
                return None
            # Collect images, lead image first. article.images is a set, so
            # it must be materialized before slicing (sorted for determinism);
            # slicing the set directly raises TypeError.
            images = []
            if article.top_image:
                images.append(article.top_image)
            for img in sorted(article.images)[:config.PUBLISHING['max_images_per_article']]:
                if img and img not in images:
                    images.append(img)
            # Extract videos (YouTube, etc.).
            videos = list(article.movies) if article.movies else []
            # Best-effort scan of the raw HTML for embeds newspaper missed.
            try:
                soup = BeautifulSoup(article.html, 'html.parser')
                for iframe in soup.find_all('iframe'):
                    src = iframe.get('src', '')
                    if 'youtube.com' in src or 'youtu.be' in src:
                        videos.append(src)
                for img in soup.find_all('img')[:10]:
                    img_src = img.get('src', '')
                    if not img_src or img_src in images:
                        continue
                    if len(images) >= config.PUBLISHING['max_images_per_article']:
                        break
                    # Keep images whose width is unknown/non-numeric or
                    # > 200px; smaller ones are likely icons or ad pixels.
                    try:
                        known_width = int(img.get('width'))
                    except (TypeError, ValueError):
                        known_width = None
                    if known_width is None or known_width > 200:
                        images.append(img_src)
            except Exception as e:
                logger.debug(f"Error extracting additional media: {e}")
            return {
                'url': url,
                'title': article.title or 'Untitled',
                'content': article.text,
                'author': ', '.join(article.authors) if article.authors else 'Unknown',
                'published_date': pub_date,
                'top_image': article.top_image,
                'images': images,
                'videos': videos,
            }
        except Exception as e:
            logger.error(f"Error extracting article from {url}: {e}")
            return None

    def is_ai_related(self, text: str) -> bool:
        """Return True when *text* mentions an AI-related keyword.

        Multi-word keywords match as substrings; short acronyms (see
        _WHOLE_WORD_KEYWORDS) must match as whole words so that e.g.
        'rain' does not count as containing 'ai'.
        """
        text_lower = text.lower()
        for keyword in self.AI_KEYWORDS:
            if keyword in self._WHOLE_WORD_KEYWORDS:
                if re.search(rf'\b{keyword}\b', text_lower):
                    return True
            elif keyword in text_lower:
                return True
        return False

    def detect_category_from_text(self, text: str) -> Optional[str]:
        """Return the config category whose keywords best match *text*, or None."""
        text_lower = text.lower()
        scores = {
            category: sum(1 for keyword in keywords if keyword in text_lower)
            for category, keywords in config.CATEGORY_KEYWORDS.items()
        }
        # Guard against an empty keyword map (max() on an empty sequence
        # raises ValueError) and against all-zero scores.
        if scores and max(scores.values()) > 0:
            return max(scores, key=scores.get)
        return None
def run_scraper():
    """Run the crawl stage: scrape all sources and log the pipeline outcome.

    Returns:
        The number of articles scraped, or 0 when the stage fails.
    """
    logger.info("Starting scraper...")
    started = time.time()
    try:
        count = AINewsScraper().scrape_all_sources()
        elapsed = int(time.time() - started)
        # Record a successful crawl stage with its metrics.
        database.log_pipeline_stage(
            stage='crawl',
            status='completed',
            articles_processed=count,
            duration=elapsed
        )
        logger.info(f"Scraper completed in {elapsed}s. Articles scraped: {count}")
        return count
    except Exception as e:
        # Top-level boundary: log the failure and record it in the pipeline.
        logger.error(f"Scraper failed: {e}")
        database.log_pipeline_stage(
            stage='crawl',
            status='failed',
            error_message=str(e)
        )
        return 0
if __name__ == '__main__':
    # `logger` is already imported at module top; the previous redundant
    # re-import of loguru here has been removed.
    logger.add(config.LOG_FILE, rotation="1 day")  # daily log rotation for CLI runs
    run_scraper()