forked from minzeyaphyo/burmddit
Compare commits
7 Commits
98af1c7cec
...
afa8fb8d78
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
afa8fb8d78 | ||
|
|
4829f15010 | ||
|
|
4ab83ba420 | ||
|
|
4cb978cc22 | ||
|
|
9d7e028550 | ||
|
|
879fdc3849 | ||
|
|
ba2c7955f4 |
25
backend/Dockerfile
Normal file
25
backend/Dockerfile
Normal file
@@ -0,0 +1,25 @@
|
||||
FROM python:3.11-slim
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
# Install system build dependencies for newspaper4k and psycopg2
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
gcc \
|
||||
libxml2-dev \
|
||||
libxslt1-dev \
|
||||
libjpeg-dev \
|
||||
zlib1g-dev \
|
||||
libpq-dev \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Install Python dependencies
|
||||
COPY requirements-pipeline.txt ./requirements.txt
|
||||
RUN pip install --no-cache-dir -r requirements.txt
|
||||
|
||||
# Download the NLTK 'punkt_tab' data needed by newspaper4k
|
||||
RUN python -c "import nltk; nltk.download('punkt_tab', quiet=True)"
|
||||
|
||||
# Copy application code
|
||||
COPY . .
|
||||
|
||||
CMD ["python", "run_pipeline.py"]
|
||||
@@ -1,6 +1,6 @@
|
||||
# Article compilation module - Groups and merges related articles
|
||||
|
||||
from typing import List, Dict, Tuple
|
||||
from typing import List, Dict, Tuple, Optional
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
from sklearn.metrics.pairwise import cosine_similarity
|
||||
from loguru import logger
|
||||
|
||||
@@ -80,7 +80,7 @@ CATEGORY_KEYWORDS = {
|
||||
|
||||
# Translation settings
|
||||
TRANSLATION = {
|
||||
'model': 'claude-3-5-sonnet-20241022',
|
||||
'model': os.getenv('CLAUDE_MODEL', 'claude-3-haiku-20240307'),
|
||||
'max_tokens': 4000,
|
||||
'temperature': 0.5, # Higher = more natural, casual translation
|
||||
'preserve_terms': [ # Technical terms to keep in English
|
||||
|
||||
29
backend/requirements-pipeline.txt
Normal file
29
backend/requirements-pipeline.txt
Normal file
@@ -0,0 +1,29 @@
|
||||
# Burmddit Pipeline - Lightweight requirements (no PyTorch/Scrapy)
|
||||
|
||||
# Web scraping
|
||||
beautifulsoup4==4.12.3
|
||||
requests==2.31.0
|
||||
feedparser==6.0.11
|
||||
newspaper4k>=0.9.3
|
||||
lxml_html_clean
|
||||
|
||||
# Database
|
||||
psycopg2-binary==2.9.9
|
||||
|
||||
# AI (Claude for translation/compilation)
|
||||
anthropic>=0.40.0
|
||||
|
||||
# Text processing
|
||||
scikit-learn==1.4.0
|
||||
python-slugify==8.0.2
|
||||
markdown==3.5.2
|
||||
bleach==6.1.0
|
||||
|
||||
# Utilities
|
||||
python-dotenv==1.0.1
|
||||
python-dateutil==2.8.2
|
||||
pytz==2024.1
|
||||
pyyaml==6.0.1
|
||||
|
||||
# Logging
|
||||
loguru==0.7.2
|
||||
@@ -31,7 +31,7 @@ class AINewsScraper:
|
||||
try:
|
||||
if source_name == 'medium':
|
||||
articles = self.scrape_medium(source_config)
|
||||
elif source_name in ['techcrunch', 'venturebeat', 'mit_tech_review']:
|
||||
elif 'url' in source_config:
|
||||
articles = self.scrape_rss_feed(source_config)
|
||||
else:
|
||||
logger.warning(f"Unknown source: {source_name}")
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
'use client'
|
||||
|
||||
import Link from 'next/link'
|
||||
import Image from 'next/image'
|
||||
|
||||
|
||||
Reference in New Issue
Block a user