Files
burmddit/backend/compiler.py
2026-02-19 20:10:34 +08:00

320 lines
11 KiB
Python

# Article compilation module - Groups and merges related articles
from typing import List, Dict, Tuple, Optional
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from loguru import logger
import anthropic
import config
import database
import time
class ArticleCompiler:
    """Groups related raw articles into clusters and compiles each cluster
    into a single reader-friendly article via the Anthropic API."""

    def __init__(self):
        # One client per compiler instance, reused across all API calls.
        self.client = anthropic.Anthropic(api_key=config.ANTHROPIC_API_KEY)

    def compile_articles(self, num_articles: Optional[int] = None) -> List[Dict]:
        """Main compilation pipeline.

        Args:
            num_articles: Target number of compiled articles. Defaults to
                config.PIPELINE['articles_per_day'].

        Returns:
            List of compiled article dicts (possibly empty if there is
            nothing to process or every cluster fails).
        """
        if num_articles is None:
            num_articles = config.PIPELINE['articles_per_day']

        # Get unprocessed articles from database
        raw_articles = database.get_unprocessed_articles(limit=100)
        if not raw_articles:
            logger.warning("No unprocessed articles found")
            return []

        logger.info(f"Processing {len(raw_articles)} raw articles")

        # Cluster similar articles
        clusters = self.cluster_articles(raw_articles, num_clusters=num_articles)

        # Compile each cluster into one comprehensive article
        compiled_articles = []
        for i, cluster in enumerate(clusters):
            try:
                logger.info(f"Compiling cluster {i+1}/{len(clusters)} with {len(cluster)} articles")
                compiled = self.compile_cluster(cluster)
                if compiled:
                    compiled_articles.append(compiled)
                time.sleep(1)  # Rate limiting between API calls
            except Exception as e:
                # Best-effort batch: one failing cluster must not abort the rest.
                logger.error(f"Error compiling cluster {i+1}: {e}")
                continue

        logger.info(f"Compiled {len(compiled_articles)} articles")
        return compiled_articles

    def cluster_articles(self, articles: List[Dict], num_clusters: int) -> List[List[Dict]]:
        """Cluster articles by TF-IDF cosine similarity.

        Greedy grouping: each unused article seeds a cluster and pulls in
        up to (sources_per_article - 1) similar, unused articles. If fewer
        than num_clusters form, leftover articles become singleton clusters.

        Returns:
            List of clusters, each a non-empty list of article dicts.
        """
        # Fewer articles than requested clusters: every article stands alone.
        if len(articles) <= num_clusters:
            return [[article] for article in articles]

        # Title plus a content prefix is enough signal for similarity.
        texts = [
            f"{article['title']} {article['content'][:500]}"
            for article in articles
        ]

        # TF-IDF vectorization followed by a full pairwise similarity matrix.
        vectorizer = TfidfVectorizer(max_features=100, stop_words='english')
        tfidf_matrix = vectorizer.fit_transform(texts)
        similarity_matrix = cosine_similarity(tfidf_matrix)

        # Greedy clustering: seed with the first unused article, attach
        # any unused article above the configured similarity threshold.
        clusters = []
        used_indices = set()
        threshold = config.PIPELINE['clustering_threshold']
        max_extra = config.PIPELINE['sources_per_article'] - 1  # seed already counts

        for i in range(len(articles)):
            if i in used_indices:
                continue

            similar_indices = [
                j for j in range(len(articles))
                if j != i and j not in used_indices
                and similarity_matrix[i][j] >= threshold
            ]

            cluster = [articles[i]]
            for idx in similar_indices[:max_extra]:  # cap cluster size
                cluster.append(articles[idx])
                used_indices.add(idx)
            clusters.append(cluster)
            used_indices.add(i)

            if len(clusters) >= num_clusters:
                break

        # Top up with singleton clusters in a single pass (the original
        # restarted the scan from index 0 for every added cluster: O(n^2)).
        if len(clusters) < num_clusters:
            for i, article in enumerate(articles):
                if len(clusters) >= num_clusters:
                    break
                if i not in used_indices:
                    clusters.append([article])
                    used_indices.add(i)

        logger.info(f"Created {len(clusters)} clusters from {len(articles)} articles")
        return clusters

    def compile_cluster(self, cluster: List[Dict]) -> Optional[Dict]:
        """Compile multiple articles into one comprehensive piece.

        Returns None for an empty cluster or when the API call fails;
        single-article clusters skip the API and are lightly enhanced.
        """
        if not cluster:
            return None

        # If only one article, use it directly (with some enhancement)
        if len(cluster) == 1:
            return self.enhance_single_article(cluster[0])

        # Prepare source summaries for the prompt.
        sources_text = ""
        for i, article in enumerate(cluster, 1):
            sources_text += f"\n\n## Source {i}: {article['title']}\n"
            sources_text += f"URL: {article['url']}\n"
            sources_text += f"Content: {article['content'][:1000]}...\n"  # First 1000 chars

        # Use Claude to compile articles
        prompt = f"""You are a friendly tech blogger writing for everyday people who are curious about AI but not tech experts. Compile these {len(cluster)} related AI articles into ONE easy-to-read, engaging article.
{sources_text}
🎯 CRITICAL REQUIREMENTS:
WRITING STYLE:
1. Write in SIMPLE, CASUAL language - like explaining to a friend
2. Use SHORT SENTENCES - easy to scan on mobile
3. AVOID JARGON - or explain it simply in parentheses
4. Use REAL-WORLD EXAMPLES and ANALOGIES
5. Make it FUN and ENGAGING - not boring or academic
6. Use active voice, not passive
7. Address readers directly ("you", "we")
CONTENT STRUCTURE:
1. Catchy, clear title (no clickbait, but interesting)
2. Hook opening: "Why should I care about this?"
3. Clear sections with descriptive subheadings
4. Key facts highlighted with bullet points
5. "What this means for you" sections
6. Brief, satisfying conclusion
EXAMPLES TO FOLLOW:
❌ Bad: "The implementation of advanced neural architectures facilitates..."
✅ Good: "New AI systems use smarter brain-like networks to..."
❌ Bad: "Anthropomorphic large language models demonstrate emergent capabilities..."
✅ Good: "ChatGPT-like AI is learning new tricks on its own..."
TARGET: Myanmar general public (will be translated to Burmese)
LENGTH: {config.PIPELINE['min_article_length']}-{config.PIPELINE['max_article_length']} words (shorter is better!)
Format the output as:
TITLE: [Engaging, clear title]
EXCERPT: [2-sentence casual summary that makes people want to read]
CONTENT:
[Your easy-to-read article with markdown formatting]
SOURCES: [List of original URLs]
"""
        try:
            message = self.client.messages.create(
                model=config.TRANSLATION['model'],
                max_tokens=config.TRANSLATION['max_tokens'],
                temperature=0.5,  # Slightly higher for creative writing
                messages=[{"role": "user", "content": prompt}]
            )
            response = message.content[0].text
            return self.parse_compiled_article(response, cluster)
        except Exception as e:
            logger.error(f"Error compiling with Claude: {e}")
            return None

    def enhance_single_article(self, article: Dict) -> Dict:
        """Wrap a single article in the compiled-article structure.

        No API call is made; the excerpt is the first 200 content chars.
        """
        return {
            'title': article['title'],
            'content': article['content'],
            'excerpt': article['content'][:200] + '...',
            'source_articles': [
                {
                    'url': article['url'],
                    'title': article['title'],
                    'author': article['author']
                }
            ],
            'category_hint': article.get('category_hint'),
            'featured_image': article.get('top_image')
        }

    def parse_compiled_article(self, response: str, cluster: List[Dict]) -> Dict:
        """Parse Claude's TITLE/EXCERPT/CONTENT/SOURCES response.

        Falls back to cluster data when a section is missing, merges
        media (max 5 images, 3 videos) from all cluster articles, and
        detects a category when no hint is present.
        """
        title = ""
        excerpt = ""
        content = ""
        current_section = None

        for line in response.strip().split('\n'):
            if line.startswith('TITLE:'):
                # Slice off the prefix only; str.replace would also strip
                # 'TITLE:' appearing later in the line.
                title = line[len('TITLE:'):].strip()
                current_section = 'title'
            elif line.startswith('EXCERPT:'):
                excerpt = line[len('EXCERPT:'):].strip()
                current_section = 'excerpt'
            elif line.startswith('CONTENT:'):
                current_section = 'content'
            elif line.startswith('SOURCES:'):
                current_section = 'sources'
            elif current_section == 'content':
                content += line + '\n'
            elif current_section == 'excerpt' and line.strip():
                # Excerpts may wrap onto extra lines before CONTENT:;
                # the original parser silently dropped them.
                excerpt += ' ' + line.strip()

        # Fallbacks if parsing fails.
        if not title:
            title = cluster[0]['title']
        if not excerpt:
            excerpt = content[:200] + '...' if content else cluster[0]['content'][:200] + '...'
        if not content:
            content = response

        # Build source articles list
        source_articles = [
            {
                'url': article['url'],
                'title': article['title'],
                'author': article['author']
            }
            for article in cluster
        ]

        # Collect all images from cluster (gallery first, top image as fallback).
        all_images = []
        for article in cluster:
            if article.get('images'):
                all_images.extend(article['images'])
            elif article.get('top_image'):
                all_images.append(article['top_image'])

        # Remove duplicates preserving order, keep first 5.
        unique_images = []
        for img in all_images:
            if img and img not in unique_images:
                unique_images.append(img)
                if len(unique_images) >= 5:
                    break

        # Collect all videos from cluster
        all_videos = []
        for article in cluster:
            if article.get('videos'):
                all_videos.extend(article['videos'])

        # Order-preserving dedupe (list(set(...)) gave unstable ordering), max 3.
        unique_videos = list(dict.fromkeys(v for v in all_videos if v))[:3]

        # Detect category
        category_hint = cluster[0].get('category_hint') or database.detect_category(title, content)

        return {
            'title': title.strip(),
            'content': content.strip(),
            'excerpt': excerpt.strip(),
            'source_articles': source_articles,
            'category_hint': category_hint,
            'featured_image': unique_images[0] if unique_images else None,
            'images': unique_images,   # 🔥 All images
            'videos': unique_videos    # 🔥 All videos
        }
def run_compiler():
    """Entry point for the compile stage.

    Runs the compiler, records the outcome in the pipeline log, and
    returns the compiled articles (empty list on failure).
    """
    logger.info("Starting compiler...")
    started = time.time()
    try:
        articles = ArticleCompiler().compile_articles()
        elapsed = int(time.time() - started)
        database.log_pipeline_stage(
            stage='compile',
            status='completed',
            articles_processed=len(articles),
            duration=elapsed
        )
        logger.info(f"Compiler completed in {elapsed}s. Articles compiled: {len(articles)}")
        return articles
    except Exception as exc:
        # Boundary catch: log, record the failure, and report no output.
        logger.error(f"Compiler failed: {exc}")
        database.log_pipeline_stage(
            stage='compile',
            status='failed',
            error_message=str(exc)
        )
        return []
if __name__ == '__main__':
    # `logger` is already imported at module level; the redundant
    # re-import that used to live here has been removed.
    logger.add(config.LOG_FILE, rotation="1 day")
    compiled = run_compiler()
    print(f"Compiled {len(compiled)} articles")