Initial Burmddit deployment - AI news aggregator in Burmese
This commit is contained in:
319
backend/compiler.py
Normal file
319
backend/compiler.py
Normal file
@@ -0,0 +1,319 @@
|
||||
# Article compilation module - Groups and merges related articles
|
||||
|
||||
import time
from typing import Dict, List, Optional, Tuple

import anthropic
from loguru import logger
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import config
import database
|
||||
|
||||
class ArticleCompiler:
|
||||
def __init__(self):
|
||||
self.client = anthropic.Anthropic(api_key=config.ANTHROPIC_API_KEY)
|
||||
|
||||
def compile_articles(self, num_articles: int = None) -> List[Dict]:
|
||||
"""Main compilation pipeline"""
|
||||
if num_articles is None:
|
||||
num_articles = config.PIPELINE['articles_per_day']
|
||||
|
||||
# Get unprocessed articles from database
|
||||
raw_articles = database.get_unprocessed_articles(limit=100)
|
||||
|
||||
if not raw_articles:
|
||||
logger.warning("No unprocessed articles found")
|
||||
return []
|
||||
|
||||
logger.info(f"Processing {len(raw_articles)} raw articles")
|
||||
|
||||
# Cluster similar articles
|
||||
clusters = self.cluster_articles(raw_articles, num_clusters=num_articles)
|
||||
|
||||
# Compile each cluster into one comprehensive article
|
||||
compiled_articles = []
|
||||
for i, cluster in enumerate(clusters):
|
||||
try:
|
||||
logger.info(f"Compiling cluster {i+1}/{len(clusters)} with {len(cluster)} articles")
|
||||
compiled = self.compile_cluster(cluster)
|
||||
|
||||
if compiled:
|
||||
compiled_articles.append(compiled)
|
||||
|
||||
time.sleep(1) # Rate limiting
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error compiling cluster {i+1}: {e}")
|
||||
continue
|
||||
|
||||
logger.info(f"Compiled {len(compiled_articles)} articles")
|
||||
return compiled_articles
|
||||
|
||||
def cluster_articles(self, articles: List[Dict], num_clusters: int) -> List[List[Dict]]:
|
||||
"""Cluster articles by similarity"""
|
||||
if len(articles) <= num_clusters:
|
||||
return [[article] for article in articles]
|
||||
|
||||
# Extract text for vectorization
|
||||
texts = [
|
||||
f"{article['title']} {article['content'][:500]}"
|
||||
for article in articles
|
||||
]
|
||||
|
||||
# TF-IDF vectorization
|
||||
vectorizer = TfidfVectorizer(max_features=100, stop_words='english')
|
||||
tfidf_matrix = vectorizer.fit_transform(texts)
|
||||
|
||||
# Calculate similarity matrix
|
||||
similarity_matrix = cosine_similarity(tfidf_matrix)
|
||||
|
||||
# Simple clustering: greedy approach
|
||||
# Find most similar articles and group them
|
||||
clusters = []
|
||||
used_indices = set()
|
||||
|
||||
for i in range(len(articles)):
|
||||
if i in used_indices:
|
||||
continue
|
||||
|
||||
# Find similar articles (above threshold)
|
||||
similar_indices = []
|
||||
for j in range(len(articles)):
|
||||
if j != i and j not in used_indices:
|
||||
if similarity_matrix[i][j] >= config.PIPELINE['clustering_threshold']:
|
||||
similar_indices.append(j)
|
||||
|
||||
# Create cluster
|
||||
cluster = [articles[i]]
|
||||
for idx in similar_indices[:config.PIPELINE['sources_per_article']-1]: # Limit cluster size
|
||||
cluster.append(articles[idx])
|
||||
used_indices.add(idx)
|
||||
|
||||
clusters.append(cluster)
|
||||
used_indices.add(i)
|
||||
|
||||
if len(clusters) >= num_clusters:
|
||||
break
|
||||
|
||||
# If we don't have enough clusters, add remaining articles individually
|
||||
while len(clusters) < num_clusters and len(used_indices) < len(articles):
|
||||
for i, article in enumerate(articles):
|
||||
if i not in used_indices:
|
||||
clusters.append([article])
|
||||
used_indices.add(i)
|
||||
break
|
||||
|
||||
logger.info(f"Created {len(clusters)} clusters from {len(articles)} articles")
|
||||
return clusters
|
||||
|
||||
def compile_cluster(self, cluster: List[Dict]) -> Optional[Dict]:
|
||||
"""Compile multiple articles into one comprehensive piece"""
|
||||
if not cluster:
|
||||
return None
|
||||
|
||||
# If only one article, use it directly (with some enhancement)
|
||||
if len(cluster) == 1:
|
||||
return self.enhance_single_article(cluster[0])
|
||||
|
||||
# Prepare source summaries
|
||||
sources_text = ""
|
||||
for i, article in enumerate(cluster, 1):
|
||||
sources_text += f"\n\n## Source {i}: {article['title']}\n"
|
||||
sources_text += f"URL: {article['url']}\n"
|
||||
sources_text += f"Content: {article['content'][:1000]}...\n" # First 1000 chars
|
||||
|
||||
# Use Claude to compile articles
|
||||
prompt = f"""You are a friendly tech blogger writing for everyday people who are curious about AI but not tech experts. Compile these {len(cluster)} related AI articles into ONE easy-to-read, engaging article.
|
||||
|
||||
{sources_text}
|
||||
|
||||
🎯 CRITICAL REQUIREMENTS:
|
||||
|
||||
WRITING STYLE:
|
||||
1. Write in SIMPLE, CASUAL language - like explaining to a friend
|
||||
2. Use SHORT SENTENCES - easy to scan on mobile
|
||||
3. AVOID JARGON - or explain it simply in parentheses
|
||||
4. Use REAL-WORLD EXAMPLES and ANALOGIES
|
||||
5. Make it FUN and ENGAGING - not boring or academic
|
||||
6. Use active voice, not passive
|
||||
7. Address readers directly ("you", "we")
|
||||
|
||||
CONTENT STRUCTURE:
|
||||
1. Catchy, clear title (no clickbait, but interesting)
|
||||
2. Hook opening: "Why should I care about this?"
|
||||
3. Clear sections with descriptive subheadings
|
||||
4. Key facts highlighted with bullet points
|
||||
5. "What this means for you" sections
|
||||
6. Brief, satisfying conclusion
|
||||
|
||||
EXAMPLES TO FOLLOW:
|
||||
❌ Bad: "The implementation of advanced neural architectures facilitates..."
|
||||
✅ Good: "New AI systems use smarter brain-like networks to..."
|
||||
|
||||
❌ Bad: "Anthropomorphic large language models demonstrate emergent capabilities..."
|
||||
✅ Good: "ChatGPT-like AI is learning new tricks on its own..."
|
||||
|
||||
TARGET: Myanmar general public (will be translated to Burmese)
|
||||
LENGTH: {config.PIPELINE['min_article_length']}-{config.PIPELINE['max_article_length']} words (shorter is better!)
|
||||
|
||||
Format the output as:
|
||||
TITLE: [Engaging, clear title]
|
||||
|
||||
EXCERPT: [2-sentence casual summary that makes people want to read]
|
||||
|
||||
CONTENT:
|
||||
[Your easy-to-read article with markdown formatting]
|
||||
|
||||
SOURCES: [List of original URLs]
|
||||
"""
|
||||
|
||||
try:
|
||||
message = self.client.messages.create(
|
||||
model=config.TRANSLATION['model'],
|
||||
max_tokens=config.TRANSLATION['max_tokens'],
|
||||
temperature=0.5, # Slightly higher for creative writing
|
||||
messages=[{"role": "user", "content": prompt}]
|
||||
)
|
||||
|
||||
response = message.content[0].text
|
||||
|
||||
# Parse response
|
||||
compiled = self.parse_compiled_article(response, cluster)
|
||||
return compiled
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error compiling with Claude: {e}")
|
||||
return None
|
||||
|
||||
def enhance_single_article(self, article: Dict) -> Dict:
|
||||
"""Enhance a single article (format, clean up, add structure)"""
|
||||
return {
|
||||
'title': article['title'],
|
||||
'content': article['content'],
|
||||
'excerpt': article['content'][:200] + '...',
|
||||
'source_articles': [
|
||||
{
|
||||
'url': article['url'],
|
||||
'title': article['title'],
|
||||
'author': article['author']
|
||||
}
|
||||
],
|
||||
'category_hint': article.get('category_hint'),
|
||||
'featured_image': article.get('top_image')
|
||||
}
|
||||
|
||||
def parse_compiled_article(self, response: str, cluster: List[Dict]) -> Dict:
|
||||
"""Parse Claude's response into structured article"""
|
||||
lines = response.strip().split('\n')
|
||||
|
||||
title = ""
|
||||
excerpt = ""
|
||||
content = ""
|
||||
|
||||
current_section = None
|
||||
|
||||
for line in lines:
|
||||
if line.startswith('TITLE:'):
|
||||
title = line.replace('TITLE:', '').strip()
|
||||
current_section = 'title'
|
||||
elif line.startswith('EXCERPT:'):
|
||||
excerpt = line.replace('EXCERPT:', '').strip()
|
||||
current_section = 'excerpt'
|
||||
elif line.startswith('CONTENT:'):
|
||||
current_section = 'content'
|
||||
elif line.startswith('SOURCES:'):
|
||||
current_section = 'sources'
|
||||
elif current_section == 'content':
|
||||
content += line + '\n'
|
||||
|
||||
# Fallback if parsing fails
|
||||
if not title:
|
||||
title = cluster[0]['title']
|
||||
if not excerpt:
|
||||
excerpt = content[:200] + '...' if content else cluster[0]['content'][:200] + '...'
|
||||
if not content:
|
||||
content = response
|
||||
|
||||
# Build source articles list
|
||||
source_articles = [
|
||||
{
|
||||
'url': article['url'],
|
||||
'title': article['title'],
|
||||
'author': article['author']
|
||||
}
|
||||
for article in cluster
|
||||
]
|
||||
|
||||
# Collect all images from cluster
|
||||
all_images = []
|
||||
for article in cluster:
|
||||
if article.get('images'):
|
||||
all_images.extend(article['images'])
|
||||
elif article.get('top_image'):
|
||||
all_images.append(article['top_image'])
|
||||
|
||||
# Remove duplicates, keep first 5
|
||||
unique_images = []
|
||||
for img in all_images:
|
||||
if img and img not in unique_images:
|
||||
unique_images.append(img)
|
||||
if len(unique_images) >= 5:
|
||||
break
|
||||
|
||||
# Collect all videos from cluster
|
||||
all_videos = []
|
||||
for article in cluster:
|
||||
if article.get('videos'):
|
||||
all_videos.extend(article['videos'])
|
||||
|
||||
# Remove duplicates
|
||||
unique_videos = list(set([v for v in all_videos if v]))[:3] # Max 3 videos
|
||||
|
||||
# Detect category
|
||||
category_hint = cluster[0].get('category_hint') or database.detect_category(title, content)
|
||||
|
||||
return {
|
||||
'title': title.strip(),
|
||||
'content': content.strip(),
|
||||
'excerpt': excerpt.strip(),
|
||||
'source_articles': source_articles,
|
||||
'category_hint': category_hint,
|
||||
'featured_image': unique_images[0] if unique_images else None,
|
||||
'images': unique_images, # 🔥 All images
|
||||
'videos': unique_videos # 🔥 All videos
|
||||
}
|
||||
|
||||
def run_compiler():
    """Run the full compilation pipeline once and record the outcome.

    Logs a 'compile' pipeline stage to the database on both success and
    failure; returns the compiled articles (empty list on failure).
    """
    logger.info("Starting compiler...")
    started = time.time()

    try:
        articles = ArticleCompiler().compile_articles()

        elapsed = int(time.time() - started)
        database.log_pipeline_stage(
            stage='compile',
            status='completed',
            articles_processed=len(articles),
            duration=elapsed
        )

        logger.info(f"Compiler completed in {elapsed}s. Articles compiled: {len(articles)}")
        return articles

    except Exception as e:
        logger.error(f"Compiler failed: {e}")
        database.log_pipeline_stage(
            stage='compile',
            status='failed',
            error_message=str(e)
        )
        return []
|
||||
|
||||
if __name__ == '__main__':
    # logger is already imported at module scope (the duplicate local
    # `from loguru import logger` was removed); just attach the file sink.
    logger.add(config.LOG_FILE, rotation="1 day")
    compiled = run_compiler()
    print(f"Compiled {len(compiled)} articles")
|
||||
Reference in New Issue
Block a user