Compare commits

...

7 Commits

Author SHA1 Message Date
Min Zeya Phyo
afa8fb8d78 Add 'use client' to ArticleCard for onClick handler 2026-02-19 21:20:19 +08:00
Min Zeya Phyo
4829f15010 Use claude-3-haiku model (configurable via CLAUDE_MODEL env) 2026-02-19 20:16:01 +08:00
Min Zeya Phyo
4ab83ba420 Upgrade anthropic SDK to fix httpx proxies compat 2026-02-19 20:13:41 +08:00
Min Zeya Phyo
4cb978cc22 Fix missing Optional import in compiler.py 2026-02-19 20:10:34 +08:00
Min Zeya Phyo
9d7e028550 Fix scraper: use newspaper4k, handle all RSS sources 2026-02-19 19:34:14 +08:00
Min Zeya Phyo
879fdc3849 Add lxml_html_clean dep for newspaper3k compat 2026-02-19 19:31:54 +08:00
Min Zeya Phyo
ba2c7955f4 Add backend pipeline Dockerfile with lightweight deps 2026-02-19 19:18:35 +08:00
6 changed files with 59 additions and 3 deletions

25
backend/Dockerfile Normal file
View File

@@ -0,0 +1,25 @@
# Backend pipeline image: installs scraping/translation deps and runs run_pipeline.py.
FROM python:3.11-slim
WORKDIR /app
# Native build/runtime libraries: gcc + libxml2/libxslt headers for lxml
# (used by newspaper4k — comment previously said newspaper3k), libjpeg/zlib
# presumably for Pillow image handling (TODO confirm), libpq for psycopg2.
# NOTE(review): gcc and the *-dev packages remain in the final image; a
# multi-stage build or `apt-get purge` afterwards would slim it down.
RUN apt-get update && apt-get install -y --no-install-recommends \
gcc \
libxml2-dev \
libxslt1-dev \
libjpeg-dev \
zlib1g-dev \
libpq-dev \
&& rm -rf /var/lib/apt/lists/*
# Copy the requirements file first so the pip layer is cached across code-only changes.
COPY requirements-pipeline.txt ./requirements.txt
RUN pip install --no-cache-dir -r requirements.txt
# Pre-fetch the NLTK 'punkt_tab' tokenizer data the article parser needs,
# so the first pipeline run does not depend on a network download.
RUN python -c "import nltk; nltk.download('punkt_tab', quiet=True)"
# Copy application code last (the most frequently changing layer).
COPY . .
# Container entry point: execute the pipeline script once and exit.
CMD ["python", "run_pipeline.py"]

View File

@@ -1,6 +1,6 @@
# Article compilation module - Groups and merges related articles # Article compilation module - Groups and merges related articles
from typing import List, Dict, Tuple from typing import List, Dict, Tuple, Optional
from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity from sklearn.metrics.pairwise import cosine_similarity
from loguru import logger from loguru import logger

View File

@@ -80,7 +80,7 @@ CATEGORY_KEYWORDS = {
# Translation settings # Translation settings
TRANSLATION = { TRANSLATION = {
'model': 'claude-3-5-sonnet-20241022', 'model': os.getenv('CLAUDE_MODEL', 'claude-3-haiku-20240307'),
'max_tokens': 4000, 'max_tokens': 4000,
'temperature': 0.5, # Higher = more natural, casual translation 'temperature': 0.5, # Higher = more natural, casual translation
'preserve_terms': [ # Technical terms to keep in English 'preserve_terms': [ # Technical terms to keep in English

View File

@@ -0,0 +1,29 @@
# Burmddit Pipeline - Lightweight requirements (no PyTorch/Scrapy)
# Web scraping
beautifulsoup4==4.12.3
requests==2.31.0
feedparser==6.0.11
# newspaper4k replaces newspaper3k (see commit "use newspaper4k").
# NOTE(review): floor pin only — consider an exact pin for reproducible builds.
newspaper4k>=0.9.3
# Added for newspaper/lxml compatibility (see "lxml_html_clean dep" commit).
# NOTE(review): unpinned — consider pinning for reproducible builds.
lxml_html_clean
# Database
psycopg2-binary==2.9.9
# AI (Claude for translation/compilation)
# >=0.40.0 works around the httpx 'proxies' incompatibility in older SDKs
# (see commit "Upgrade anthropic SDK to fix httpx proxies compat").
anthropic>=0.40.0
# Text processing
scikit-learn==1.4.0
python-slugify==8.0.2
markdown==3.5.2
bleach==6.1.0
# Utilities
python-dotenv==1.0.1
python-dateutil==2.8.2
pytz==2024.1
pyyaml==6.0.1
# Logging
loguru==0.7.2

View File

@@ -31,7 +31,7 @@ class AINewsScraper:
try: try:
if source_name == 'medium': if source_name == 'medium':
articles = self.scrape_medium(source_config) articles = self.scrape_medium(source_config)
elif source_name in ['techcrunch', 'venturebeat', 'mit_tech_review']: elif 'url' in source_config:
articles = self.scrape_rss_feed(source_config) articles = self.scrape_rss_feed(source_config)
else: else:
logger.warning(f"Unknown source: {source_name}") logger.warning(f"Unknown source: {source_name}")

View File

@@ -1,3 +1,5 @@
'use client'
import Link from 'next/link' import Link from 'next/link'
import Image from 'next/image' import Image from 'next/image'