forked from minzeyaphyo/burmddit
320 lines
11 KiB
Python
320 lines
11 KiB
Python
# Article compilation module - Groups and merges related articles
|
|
|
|
from typing import List, Dict, Tuple, Optional
|
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
from sklearn.metrics.pairwise import cosine_similarity
|
|
from loguru import logger
|
|
import anthropic
|
|
import config
|
|
import database
|
|
import time
|
|
|
|
class ArticleCompiler:
    """Clusters related raw articles and compiles each cluster into a single
    reader-friendly article using the Anthropic (Claude) API."""

    def __init__(self):
        # Client used for all Claude calls made by compile_cluster().
        self.client = anthropic.Anthropic(api_key=config.ANTHROPIC_API_KEY)
|
    def compile_articles(self, num_articles: Optional[int] = None) -> List[Dict]:
        """Main compilation pipeline.

        Fetches up to 100 unprocessed articles from the database, clusters
        them by similarity, and compiles each cluster into one article.

        Args:
            num_articles: Target number of compiled articles; defaults to
                config.PIPELINE['articles_per_day'].

        Returns:
            List of compiled article dicts (see parse_compiled_article);
            empty list when there is nothing to process.
        """
        if num_articles is None:
            num_articles = config.PIPELINE['articles_per_day']

        # Get unprocessed articles from database
        raw_articles = database.get_unprocessed_articles(limit=100)

        if not raw_articles:
            logger.warning("No unprocessed articles found")
            return []

        logger.info(f"Processing {len(raw_articles)} raw articles")

        # Cluster similar articles
        clusters = self.cluster_articles(raw_articles, num_clusters=num_articles)

        # Compile each cluster into one comprehensive article
        compiled_articles = []
        for i, cluster in enumerate(clusters):
            try:
                logger.info(f"Compiling cluster {i+1}/{len(clusters)} with {len(cluster)} articles")
                compiled = self.compile_cluster(cluster)

                if compiled:
                    compiled_articles.append(compiled)

                time.sleep(1)  # Rate limiting

            except Exception as e:
                # Best-effort batch: a failed cluster is logged and skipped so
                # the remaining clusters still get compiled.
                logger.error(f"Error compiling cluster {i+1}: {e}")
                continue

        logger.info(f"Compiled {len(compiled_articles)} articles")
        return compiled_articles
|
def cluster_articles(self, articles: List[Dict], num_clusters: int) -> List[List[Dict]]:
|
|
"""Cluster articles by similarity"""
|
|
if len(articles) <= num_clusters:
|
|
return [[article] for article in articles]
|
|
|
|
# Extract text for vectorization
|
|
texts = [
|
|
f"{article['title']} {article['content'][:500]}"
|
|
for article in articles
|
|
]
|
|
|
|
# TF-IDF vectorization
|
|
vectorizer = TfidfVectorizer(max_features=100, stop_words='english')
|
|
tfidf_matrix = vectorizer.fit_transform(texts)
|
|
|
|
# Calculate similarity matrix
|
|
similarity_matrix = cosine_similarity(tfidf_matrix)
|
|
|
|
# Simple clustering: greedy approach
|
|
# Find most similar articles and group them
|
|
clusters = []
|
|
used_indices = set()
|
|
|
|
for i in range(len(articles)):
|
|
if i in used_indices:
|
|
continue
|
|
|
|
# Find similar articles (above threshold)
|
|
similar_indices = []
|
|
for j in range(len(articles)):
|
|
if j != i and j not in used_indices:
|
|
if similarity_matrix[i][j] >= config.PIPELINE['clustering_threshold']:
|
|
similar_indices.append(j)
|
|
|
|
# Create cluster
|
|
cluster = [articles[i]]
|
|
for idx in similar_indices[:config.PIPELINE['sources_per_article']-1]: # Limit cluster size
|
|
cluster.append(articles[idx])
|
|
used_indices.add(idx)
|
|
|
|
clusters.append(cluster)
|
|
used_indices.add(i)
|
|
|
|
if len(clusters) >= num_clusters:
|
|
break
|
|
|
|
# If we don't have enough clusters, add remaining articles individually
|
|
while len(clusters) < num_clusters and len(used_indices) < len(articles):
|
|
for i, article in enumerate(articles):
|
|
if i not in used_indices:
|
|
clusters.append([article])
|
|
used_indices.add(i)
|
|
break
|
|
|
|
logger.info(f"Created {len(clusters)} clusters from {len(articles)} articles")
|
|
return clusters
|
|
|
|
    def compile_cluster(self, cluster: List[Dict]) -> Optional[Dict]:
        """Compile multiple articles into one comprehensive piece.

        Single-article clusters bypass the LLM via enhance_single_article.
        Multi-article clusters are summarized into a prompt and sent to
        Claude; the response is parsed by parse_compiled_article.

        Returns:
            Compiled article dict, or None on empty input / API failure.
        """
        if not cluster:
            return None

        # If only one article, use it directly (with some enhancement)
        if len(cluster) == 1:
            return self.enhance_single_article(cluster[0])

        # Prepare source summaries for the prompt.
        sources_text = ""
        for i, article in enumerate(cluster, 1):
            sources_text += f"\n\n## Source {i}: {article['title']}\n"
            sources_text += f"URL: {article['url']}\n"
            sources_text += f"Content: {article['content'][:1000]}...\n"  # First 1000 chars

        # Use Claude to compile articles
        prompt = f"""You are a friendly tech blogger writing for everyday people who are curious about AI but not tech experts. Compile these {len(cluster)} related AI articles into ONE easy-to-read, engaging article.

{sources_text}

🎯 CRITICAL REQUIREMENTS:

WRITING STYLE:
1. Write in SIMPLE, CASUAL language - like explaining to a friend
2. Use SHORT SENTENCES - easy to scan on mobile
3. AVOID JARGON - or explain it simply in parentheses
4. Use REAL-WORLD EXAMPLES and ANALOGIES
5. Make it FUN and ENGAGING - not boring or academic
6. Use active voice, not passive
7. Address readers directly ("you", "we")

CONTENT STRUCTURE:
1. Catchy, clear title (no clickbait, but interesting)
2. Hook opening: "Why should I care about this?"
3. Clear sections with descriptive subheadings
4. Key facts highlighted with bullet points
5. "What this means for you" sections
6. Brief, satisfying conclusion

EXAMPLES TO FOLLOW:
❌ Bad: "The implementation of advanced neural architectures facilitates..."
✅ Good: "New AI systems use smarter brain-like networks to..."

❌ Bad: "Anthropomorphic large language models demonstrate emergent capabilities..."
✅ Good: "ChatGPT-like AI is learning new tricks on its own..."

TARGET: Myanmar general public (will be translated to Burmese)
LENGTH: {config.PIPELINE['min_article_length']}-{config.PIPELINE['max_article_length']} words (shorter is better!)

Format the output as:
TITLE: [Engaging, clear title]

EXCERPT: [2-sentence casual summary that makes people want to read]

CONTENT:
[Your easy-to-read article with markdown formatting]

SOURCES: [List of original URLs]
"""

        try:
            # NOTE(review): this reuses the TRANSLATION model/token config for
            # compilation — confirm a dedicated compile config isn't intended.
            message = self.client.messages.create(
                model=config.TRANSLATION['model'],
                max_tokens=config.TRANSLATION['max_tokens'],
                temperature=0.5,  # Slightly higher for creative writing
                messages=[{"role": "user", "content": prompt}]
            )

            response = message.content[0].text

            # Parse response into the structured article dict.
            compiled = self.parse_compiled_article(response, cluster)
            return compiled

        except Exception as e:
            # API/parse failures are logged; caller treats None as "skip".
            logger.error(f"Error compiling with Claude: {e}")
            return None
|
def enhance_single_article(self, article: Dict) -> Dict:
|
|
"""Enhance a single article (format, clean up, add structure)"""
|
|
return {
|
|
'title': article['title'],
|
|
'content': article['content'],
|
|
'excerpt': article['content'][:200] + '...',
|
|
'source_articles': [
|
|
{
|
|
'url': article['url'],
|
|
'title': article['title'],
|
|
'author': article['author']
|
|
}
|
|
],
|
|
'category_hint': article.get('category_hint'),
|
|
'featured_image': article.get('top_image')
|
|
}
|
|
|
|
def parse_compiled_article(self, response: str, cluster: List[Dict]) -> Dict:
|
|
"""Parse Claude's response into structured article"""
|
|
lines = response.strip().split('\n')
|
|
|
|
title = ""
|
|
excerpt = ""
|
|
content = ""
|
|
|
|
current_section = None
|
|
|
|
for line in lines:
|
|
if line.startswith('TITLE:'):
|
|
title = line.replace('TITLE:', '').strip()
|
|
current_section = 'title'
|
|
elif line.startswith('EXCERPT:'):
|
|
excerpt = line.replace('EXCERPT:', '').strip()
|
|
current_section = 'excerpt'
|
|
elif line.startswith('CONTENT:'):
|
|
current_section = 'content'
|
|
elif line.startswith('SOURCES:'):
|
|
current_section = 'sources'
|
|
elif current_section == 'content':
|
|
content += line + '\n'
|
|
|
|
# Fallback if parsing fails
|
|
if not title:
|
|
title = cluster[0]['title']
|
|
if not excerpt:
|
|
excerpt = content[:200] + '...' if content else cluster[0]['content'][:200] + '...'
|
|
if not content:
|
|
content = response
|
|
|
|
# Build source articles list
|
|
source_articles = [
|
|
{
|
|
'url': article['url'],
|
|
'title': article['title'],
|
|
'author': article['author']
|
|
}
|
|
for article in cluster
|
|
]
|
|
|
|
# Collect all images from cluster
|
|
all_images = []
|
|
for article in cluster:
|
|
if article.get('images'):
|
|
all_images.extend(article['images'])
|
|
elif article.get('top_image'):
|
|
all_images.append(article['top_image'])
|
|
|
|
# Remove duplicates, keep first 5
|
|
unique_images = []
|
|
for img in all_images:
|
|
if img and img not in unique_images:
|
|
unique_images.append(img)
|
|
if len(unique_images) >= 5:
|
|
break
|
|
|
|
# Collect all videos from cluster
|
|
all_videos = []
|
|
for article in cluster:
|
|
if article.get('videos'):
|
|
all_videos.extend(article['videos'])
|
|
|
|
# Remove duplicates
|
|
unique_videos = list(set([v for v in all_videos if v]))[:3] # Max 3 videos
|
|
|
|
# Detect category
|
|
category_hint = cluster[0].get('category_hint') or database.detect_category(title, content)
|
|
|
|
return {
|
|
'title': title.strip(),
|
|
'content': content.strip(),
|
|
'excerpt': excerpt.strip(),
|
|
'source_articles': source_articles,
|
|
'category_hint': category_hint,
|
|
'featured_image': unique_images[0] if unique_images else None,
|
|
'images': unique_images, # 🔥 All images
|
|
'videos': unique_videos # 🔥 All videos
|
|
}
|
|
|
|
def run_compiler():
    """Main compiler execution"""
    logger.info("Starting compiler...")
    started = time.time()

    try:
        articles = ArticleCompiler().compile_articles()

        elapsed = int(time.time() - started)
        # Record a successful run in the pipeline log.
        database.log_pipeline_stage(
            stage='compile',
            status='completed',
            articles_processed=len(articles),
            duration=elapsed,
        )

        logger.info(f"Compiler completed in {elapsed}s. Articles compiled: {len(articles)}")
        return articles

    except Exception as e:
        # Record the failure and return an empty batch rather than crashing.
        logger.error(f"Compiler failed: {e}")
        database.log_pipeline_stage(
            stage='compile',
            status='failed',
            error_message=str(e),
        )
        return []
|
|
|
if __name__ == '__main__':
    # Fix: removed the redundant `from loguru import logger` that shadowed
    # the identical module-level import.
    logger.add(config.LOG_FILE, rotation="1 day")
    compiled = run_compiler()
    print(f"Compiled {len(compiled)} articles")