# Forked from minzeyaphyo/burmddit
# Web scraper for AI news sources
import re
import time
from datetime import datetime, timedelta
from typing import Dict, List, Optional

import feedparser
import requests
from bs4 import BeautifulSoup
from loguru import logger
from newspaper import Article

import config
import database
class AINewsScraper:
    """Scrape AI-related articles from the sources declared in ``config.SOURCES``.

    Two source kinds are supported: Medium tag pages (HTML scraping) and
    RSS feeds. Every extracted article is persisted through
    ``database.insert_raw_article``.
    """

    def __init__(self):
        # One shared HTTP session (connection pooling) with an identifying UA.
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (compatible; BurmdditBot/1.0; +https://burmddit.vercel.app)'
        })

    def scrape_all_sources(self) -> int:
        """Scrape all enabled sources and store their articles.

        Returns:
            Number of articles newly inserted into the database
            (duplicates rejected by the database are not counted).
        """
        total_articles = 0

        for source_name, source_config in config.SOURCES.items():
            if not source_config.get('enabled', True):
                continue

            logger.info(f"Scraping {source_name}...")

            try:
                if source_name == 'medium':
                    articles = self.scrape_medium(source_config)
                elif 'url' in source_config:
                    articles = self.scrape_rss_feed(source_config)
                else:
                    logger.warning(f"Unknown source: {source_name}")
                    continue

                # Store articles; insert_raw_article returns a falsy id for
                # rows it refused (e.g. duplicates), so only truthy ids count.
                for article in articles:
                    article_id = database.insert_raw_article(
                        url=article['url'],
                        title=article['title'],
                        content=article['content'],
                        author=article['author'],
                        published_date=article['published_date'],
                        source=source_name,
                        category_hint=article.get('category_hint')
                    )
                    if article_id:
                        total_articles += 1

                logger.info(f"Scraped {len(articles)} articles from {source_name}")
                time.sleep(config.RATE_LIMITS['delay_between_requests'])

            except Exception as e:
                # A single failing source must not abort the whole run.
                logger.error(f"Error scraping {source_name}: {e}")
                continue

        logger.info(f"Total articles scraped: {total_articles}")
        return total_articles

    def scrape_medium(self, source_config: Dict) -> List[Dict]:
        """Scrape Medium articles for each tag in ``source_config['tags']``.

        Returns:
            List of article dicts as produced by extract_article_content(),
            each annotated with a 'category_hint'.
        """
        articles = []

        for tag in source_config['tags']:
            try:
                url = source_config['url_pattern'].format(tag=tag)
                response = self.session.get(url, timeout=30)
                # FIX: fail fast on HTTP errors instead of parsing error pages.
                response.raise_for_status()
                soup = BeautifulSoup(response.content, 'html.parser')

                # Medium's structure: article cards are <article> elements.
                article_elements = soup.find_all(
                    'article', limit=source_config['articles_per_tag']
                )

                for element in article_elements:
                    try:
                        # First link inside the card points at the article.
                        link = element.find('a', href=True)
                        if not link:
                            continue

                        article_url = link['href']
                        if not article_url.startswith('http'):
                            article_url = 'https://medium.com' + article_url

                        # Full-text extraction via newspaper3k.
                        article = self.extract_article_content(article_url)
                        if article:
                            article['category_hint'] = self.detect_category_from_text(
                                article['title'] + ' ' + article['content'][:500]
                            )
                            articles.append(article)

                    except Exception as e:
                        logger.error(f"Error parsing Medium article: {e}")
                        continue

                time.sleep(2)  # Rate limiting between tag pages

            except Exception as e:
                logger.error(f"Error scraping Medium tag '{tag}': {e}")
                continue

        return articles

    def scrape_rss_feed(self, source_config: Dict) -> List[Dict]:
        """Scrape articles from the RSS feed at ``source_config['url']``.

        Respects the optional 'articles_limit' (default 20) and, when
        'filter_ai' is set, drops entries that do not look AI-related.
        """
        articles = []

        try:
            feed = feedparser.parse(source_config['url'])

            for entry in feed.entries[:source_config.get('articles_limit', 20)]:
                try:
                    # Optional AI filter based on title + summary text.
                    if source_config.get('filter_ai') and not self.is_ai_related(
                        entry.title + ' ' + entry.get('summary', '')
                    ):
                        continue

                    article = self.extract_article_content(entry.link)

                    if article:
                        article['category_hint'] = self.detect_category_from_text(
                            article['title'] + ' ' + article['content'][:500]
                        )
                        articles.append(article)

                except Exception as e:
                    logger.error(f"Error parsing RSS entry: {e}")
                    continue

        except Exception as e:
            logger.error(f"Error fetching RSS feed: {e}")

        return articles

    def extract_article_content(self, url: str) -> Optional[Dict]:
        """Download and parse one article with newspaper3k.

        Returns:
            Dict with keys url/title/content/author/published_date/
            top_image/images/videos, or None when the article is too
            short, too old, or fails to download/parse.
        """
        try:
            article = Article(url)
            article.download()
            article.parse()

            # Skip articles too short to be real stories.
            if len(article.text) < 500:
                logger.debug(f"Article too short, skipping: {url}")
                return None

            # Publication date, falling back to "now" when missing.
            pub_date = article.publish_date
            if not pub_date:
                pub_date = datetime.now()
            elif pub_date.tzinfo is not None:
                # FIX: newspaper3k can return a timezone-aware datetime;
                # subtracting it from naive datetime.now() raises TypeError.
                # Drop tzinfo so the freshness check below always works.
                pub_date = pub_date.replace(tzinfo=None)

            # Skip stale articles (older than 2 days).
            if datetime.now() - pub_date > timedelta(days=2):
                logger.debug(f"Article too old, skipping: {url}")
                return None

            # Collect images, lead image first.
            images = []
            if article.top_image:
                images.append(article.top_image)

            # FIX: article.images may be a set in newspaper3k (not
            # sliceable) — list() makes the cap safe either way.
            for img in list(article.images)[:config.PUBLISHING['max_images_per_article']]:
                if img and img not in images:
                    images.append(img)

            # Videos detected by newspaper3k.
            videos = []
            if article.movies:
                videos = list(article.movies)

            # Best-effort: mine raw HTML for YouTube embeds and extra images.
            try:
                soup = BeautifulSoup(article.html, 'html.parser')

                for iframe in soup.find_all('iframe'):
                    src = iframe.get('src', '')
                    if 'youtube.com' in src or 'youtu.be' in src:
                        videos.append(src)

                for img in soup.find_all('img')[:10]:
                    img_src = img.get('src', '')
                    if img_src and img_src not in images and len(images) < config.PUBLISHING['max_images_per_article']:
                        # Filter out tiny images (likely icons/ads); keep
                        # images with unknown or non-numeric width.
                        width = img.get('width', 0)
                        if not width or (isinstance(width, str) and not width.isdigit()) or int(str(width)) > 200:
                            images.append(img_src)
            except Exception as e:
                # Media mining is optional; the article is still usable.
                logger.debug(f"Error extracting additional media: {e}")

            return {
                'url': url,
                'title': article.title or 'Untitled',
                'content': article.text,
                'author': ', '.join(article.authors) if article.authors else 'Unknown',
                'published_date': pub_date,
                'top_image': article.top_image,
                'images': images,
                'videos': videos
            }

        except Exception as e:
            logger.error(f"Error extracting article from {url}: {e}")
            return None

    def is_ai_related(self, text: str) -> bool:
        """Return True if *text* mentions any AI-related keyword.

        FIX: keywords are matched on word boundaries, so short tokens
        such as 'ai' or 'ml' no longer match inside unrelated words
        ('email', 'html', 'training').
        """
        ai_keywords = [
            'artificial intelligence', 'ai', 'machine learning', 'ml',
            'deep learning', 'neural network', 'chatgpt', 'gpt', 'llm',
            'claude', 'openai', 'anthropic', 'transformer', 'nlp',
            'generative ai', 'automation', 'computer vision'
        ]

        text_lower = text.lower()
        return any(
            re.search(r'\b' + re.escape(keyword) + r'\b', text_lower)
            for keyword in ai_keywords
        )

    def detect_category_from_text(self, text: str) -> Optional[str]:
        """Return the best-matching category for *text*, or None.

        Scores each category in config.CATEGORY_KEYWORDS by how many of
        its keywords appear in the text (case-insensitive substring).
        """
        text_lower = text.lower()
        scores = {
            category: sum(1 for keyword in keywords if keyword in text_lower)
            for category, keywords in config.CATEGORY_KEYWORDS.items()
        }

        # FIX: guard the empty-map case — max() on an empty sequence raises.
        if scores and max(scores.values()) > 0:
            return max(scores, key=scores.get)

        return None
def run_scraper():
    """Run one full scrape pass and record the outcome in the pipeline log.

    Returns:
        Number of articles scraped, or 0 when the run fails.
    """
    logger.info("Starting scraper...")
    started_at = time.time()

    try:
        articles_count = AINewsScraper().scrape_all_sources()
        duration = int(time.time() - started_at)

        database.log_pipeline_stage(
            stage='crawl',
            status='completed',
            articles_processed=articles_count,
            duration=duration,
        )
        logger.info(f"Scraper completed in {duration}s. Articles scraped: {articles_count}")
        return articles_count

    except Exception as e:
        logger.error(f"Scraper failed: {e}")
        database.log_pipeline_stage(
            stage='crawl',
            status='failed',
            error_message=str(e),
        )
        return 0
if __name__ == '__main__':
    # FIX: removed a redundant `from loguru import logger` re-import that
    # shadowed the module-level import. Configure daily-rotating file
    # logging only when run as a script, then execute one scrape pass.
    logger.add(config.LOG_FILE, rotation="1 day")
    run_scraper()