forked from minzeyaphyo/burmddit
Compare commits
7 Commits
98af1c7cec
...
afa8fb8d78
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
afa8fb8d78 | ||
|
|
4829f15010 | ||
|
|
4ab83ba420 | ||
|
|
4cb978cc22 | ||
|
|
9d7e028550 | ||
|
|
879fdc3849 | ||
|
|
ba2c7955f4 |
25
backend/Dockerfile
Normal file
25
backend/Dockerfile
Normal file
@@ -0,0 +1,25 @@
|
|||||||
|
# Pipeline container image for backend/run_pipeline.py
# (dependencies pinned in backend/requirements-pipeline.txt).
FROM python:3.11-slim

# Unbuffered stdout/stderr so pipeline logs appear in the container log
# in real time; skip .pyc generation to keep the image slightly smaller.
ENV PYTHONUNBUFFERED=1 \
    PYTHONDONTWRITEBYTECODE=1

WORKDIR /app

# Install system build dependencies for newspaper4k (lxml/libxslt,
# jpeg/zlib image handling) and psycopg2 (libpq); clean the apt lists
# in the same layer to keep the image small.
RUN apt-get update && apt-get install -y --no-install-recommends \
    gcc \
    libxml2-dev \
    libxslt1-dev \
    libjpeg-dev \
    zlib1g-dev \
    libpq-dev \
    && rm -rf /var/lib/apt/lists/*

# Install Python dependencies before copying the source tree so this
# layer is cached independently of application-code changes.
COPY requirements-pipeline.txt ./requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

# Download NLTK tokenizer data needed by newspaper4k at runtime
RUN python -c "import nltk; nltk.download('punkt_tab', quiet=True)"

# Copy application code
COPY . .

CMD ["python", "run_pipeline.py"]
|
||||||
@@ -1,6 +1,6 @@
|
|||||||
# Article compilation module - Groups and merges related articles
|
# Article compilation module - Groups and merges related articles
|
||||||
|
|
||||||
from typing import List, Dict, Tuple
|
from typing import List, Dict, Tuple, Optional
|
||||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||||
from sklearn.metrics.pairwise import cosine_similarity
|
from sklearn.metrics.pairwise import cosine_similarity
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
|
|||||||
@@ -80,7 +80,7 @@ CATEGORY_KEYWORDS = {
|
|||||||
|
|
||||||
# Translation settings
|
# Translation settings
|
||||||
TRANSLATION = {
|
TRANSLATION = {
|
||||||
'model': 'claude-3-5-sonnet-20241022',
|
'model': os.getenv('CLAUDE_MODEL', 'claude-3-haiku-20240307'),
|
||||||
'max_tokens': 4000,
|
'max_tokens': 4000,
|
||||||
'temperature': 0.5, # Higher = more natural, casual translation
|
'temperature': 0.5, # Higher = more natural, casual translation
|
||||||
'preserve_terms': [ # Technical terms to keep in English
|
'preserve_terms': [ # Technical terms to keep in English
|
||||||
|
|||||||
29
backend/requirements-pipeline.txt
Normal file
29
backend/requirements-pipeline.txt
Normal file
@@ -0,0 +1,29 @@
|
|||||||
|
# Burmddit Pipeline - Lightweight requirements (no PyTorch/Scrapy)

# Web scraping
beautifulsoup4==4.12.3
requests==2.31.0
feedparser==6.0.11
newspaper4k>=0.9.3
lxml_html_clean

# Database
psycopg2-binary==2.9.9

# AI (Claude for translation/compilation)
anthropic>=0.40.0

# Text processing
scikit-learn==1.4.0
python-slugify==8.0.2
markdown==3.5.2
bleach==6.1.0

# Utilities
python-dotenv==1.0.1
python-dateutil==2.8.2
pytz==2024.1
pyyaml==6.0.1

# Logging
loguru==0.7.2
|
||||||
@@ -31,7 +31,7 @@ class AINewsScraper:
|
|||||||
try:
|
try:
|
||||||
if source_name == 'medium':
|
if source_name == 'medium':
|
||||||
articles = self.scrape_medium(source_config)
|
articles = self.scrape_medium(source_config)
|
||||||
elif source_name in ['techcrunch', 'venturebeat', 'mit_tech_review']:
|
elif 'url' in source_config:
|
||||||
articles = self.scrape_rss_feed(source_config)
|
articles = self.scrape_rss_feed(source_config)
|
||||||
else:
|
else:
|
||||||
logger.warning(f"Unknown source: {source_name}")
|
logger.warning(f"Unknown source: {source_name}")
|
||||||
|
|||||||
@@ -1,3 +1,5 @@
|
|||||||
|
'use client'
|
||||||
|
|
||||||
import Link from 'next/link'
|
import Link from 'next/link'
|
||||||
import Image from 'next/image'
|
import Image from 'next/image'
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user