# Article compilation module - Groups and merges related articles

from typing import List, Dict, Tuple, Optional

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from loguru import logger
import anthropic

import config
import database
import time


class ArticleCompiler:
    """Clusters related raw articles and compiles each cluster into one
    comprehensive, reader-friendly article via the Anthropic API."""

    def __init__(self):
        self.client = anthropic.Anthropic(api_key=config.ANTHROPIC_API_KEY)

    def compile_articles(self, num_articles: Optional[int] = None) -> List[Dict]:
        """Main compilation pipeline.

        Fetches unprocessed articles from the database, clusters them by
        textual similarity, and compiles each cluster into one article.

        Args:
            num_articles: Target number of compiled articles; defaults to
                config.PIPELINE['articles_per_day'].

        Returns:
            List of compiled article dicts (possibly empty).
        """
        if num_articles is None:
            num_articles = config.PIPELINE['articles_per_day']

        # Get unprocessed articles from database
        raw_articles = database.get_unprocessed_articles(limit=100)

        if not raw_articles:
            logger.warning("No unprocessed articles found")
            return []

        logger.info(f"Processing {len(raw_articles)} raw articles")

        # Cluster similar articles
        clusters = self.cluster_articles(raw_articles, num_clusters=num_articles)

        # Compile each cluster into one comprehensive article
        compiled_articles = []
        for i, cluster in enumerate(clusters):
            try:
                logger.info(f"Compiling cluster {i+1}/{len(clusters)} with {len(cluster)} articles")
                compiled = self.compile_cluster(cluster)
                if compiled:
                    compiled_articles.append(compiled)
                time.sleep(1)  # Rate limiting between API calls
            except Exception as e:
                # Best-effort: one failed cluster must not abort the whole run.
                logger.error(f"Error compiling cluster {i+1}: {e}")
                continue

        logger.info(f"Compiled {len(compiled_articles)} articles")
        return compiled_articles

    def cluster_articles(self, articles: List[Dict], num_clusters: int) -> List[List[Dict]]:
        """Cluster articles by TF-IDF cosine similarity (greedy grouping).

        Args:
            articles: Raw article dicts with at least 'title' and 'content'.
            num_clusters: Maximum number of clusters to produce.

        Returns:
            List of clusters, each a non-empty list of article dicts.
        """
        if len(articles) <= num_clusters:
            return [[article] for article in articles]

        # Extract text for vectorization (title + first 500 chars of body)
        texts = [
            f"{article['title']} {article['content'][:500]}"
            for article in articles
        ]

        # TF-IDF vectorization
        vectorizer = TfidfVectorizer(max_features=100, stop_words='english')
        tfidf_matrix = vectorizer.fit_transform(texts)

        # Pairwise cosine-similarity matrix
        similarity_matrix = cosine_similarity(tfidf_matrix)

        # Simple clustering: greedy approach.
        # For each not-yet-used article, pull in all sufficiently-similar
        # unused articles, capped at sources_per_article per cluster.
        clusters = []
        used_indices = set()

        for i in range(len(articles)):
            if i in used_indices:
                continue

            # Find similar articles (above threshold)
            similar_indices = []
            for j in range(len(articles)):
                if j != i and j not in used_indices:
                    if similarity_matrix[i][j] >= config.PIPELINE['clustering_threshold']:
                        similar_indices.append(j)

            # Create cluster (limit cluster size)
            cluster = [articles[i]]
            for idx in similar_indices[:config.PIPELINE['sources_per_article'] - 1]:
                cluster.append(articles[idx])
                used_indices.add(idx)

            clusters.append(cluster)
            used_indices.add(i)

            if len(clusters) >= num_clusters:
                break

        # If we don't have enough clusters, add remaining articles individually
        while len(clusters) < num_clusters and len(used_indices) < len(articles):
            for i, article in enumerate(articles):
                if i not in used_indices:
                    clusters.append([article])
                    used_indices.add(i)
                    break

        logger.info(f"Created {len(clusters)} clusters from {len(articles)} articles")
        return clusters

    def compile_cluster(self, cluster: List[Dict]) -> Optional[Dict]:
        """Compile multiple articles into one comprehensive piece via Claude.

        Returns None for an empty cluster or on API failure; single-article
        clusters bypass the API and go through enhance_single_article.
        """
        if not cluster:
            return None

        # If only one article, use it directly (with some enhancement)
        if len(cluster) == 1:
            return self.enhance_single_article(cluster[0])

        # Prepare source summaries
        sources_text = ""
        for i, article in enumerate(cluster, 1):
            sources_text += f"\n\n## Source {i}: {article['title']}\n"
            sources_text += f"URL: {article['url']}\n"
            sources_text += f"Content: {article['content'][:1000]}...\n"  # First 1000 chars

        # Use Claude to compile articles
        prompt = f"""You are a friendly tech blogger writing for everyday people who are curious about AI but not tech experts.

Compile these {len(cluster)} related AI articles into ONE easy-to-read, engaging article.

{sources_text}

🎯 CRITICAL REQUIREMENTS:

WRITING STYLE:
1. Write in SIMPLE, CASUAL language - like explaining to a friend
2. Use SHORT SENTENCES - easy to scan on mobile
3. AVOID JARGON - or explain it simply in parentheses
4. Use REAL-WORLD EXAMPLES and ANALOGIES
5. Make it FUN and ENGAGING - not boring or academic
6. Use active voice, not passive
7. Address readers directly ("you", "we")

CONTENT STRUCTURE:
1. Catchy, clear title (no clickbait, but interesting)
2. Hook opening: "Why should I care about this?"
3. Clear sections with descriptive subheadings
4. Key facts highlighted with bullet points
5. "What this means for you" sections
6. Brief, satisfying conclusion

EXAMPLES TO FOLLOW:
❌ Bad: "The implementation of advanced neural architectures facilitates..."
✅ Good: "New AI systems use smarter brain-like networks to..."

❌ Bad: "Anthropomorphic large language models demonstrate emergent capabilities..."
✅ Good: "ChatGPT-like AI is learning new tricks on its own..."

TARGET: Myanmar general public (will be translated to Burmese)
LENGTH: {config.PIPELINE['min_article_length']}-{config.PIPELINE['max_article_length']} words (shorter is better!)

Format the output as:
TITLE: [Engaging, clear title]

EXCERPT: [2-sentence casual summary that makes people want to read]

CONTENT: [Your easy-to-read article with markdown formatting]

SOURCES: [List of original URLs]
"""

        try:
            message = self.client.messages.create(
                model=config.TRANSLATION['model'],
                max_tokens=config.TRANSLATION['max_tokens'],
                temperature=0.5,  # Slightly higher for creative writing
                messages=[{"role": "user", "content": prompt}]
            )

            response = message.content[0].text

            # Parse response
            compiled = self.parse_compiled_article(response, cluster)
            return compiled
        except Exception as e:
            logger.error(f"Error compiling with Claude: {e}")
            return None

    def enhance_single_article(self, article: Dict) -> Dict:
        """Enhance a single article (format, clean up, add structure).

        Returns the same output schema as parse_compiled_article (including
        'images' and 'videos') so downstream consumers see one shape.
        """
        # Prefer the article's image list; fall back to its top image.
        images = article.get('images') or (
            [article['top_image']] if article.get('top_image') else []
        )
        return {
            'title': article['title'],
            'content': article['content'],
            'excerpt': article['content'][:200] + '...',
            'source_articles': [
                {
                    'url': article['url'],
                    'title': article['title'],
                    # .get(): some feeds have no author; avoid KeyError.
                    'author': article.get('author'),
                }
            ],
            'category_hint': article.get('category_hint'),
            'featured_image': article.get('top_image'),
            # Same caps as parse_compiled_article: max 5 images, 3 videos.
            'images': images[:5],
            'videos': (article.get('videos') or [])[:3],
        }

    def parse_compiled_article(self, response: str, cluster: List[Dict]) -> Dict:
        """Parse Claude's marker-formatted response into a structured article.

        Falls back to the first source article's title/content when the
        TITLE:/EXCERPT:/CONTENT: markers are missing from the response.
        """
        lines = response.strip().split('\n')

        title = ""
        excerpt = ""
        content = ""
        current_section = None

        for line in lines:
            if line.startswith('TITLE:'):
                # Slice off only the leading marker; str.replace would also
                # delete any later occurrence of 'TITLE:' inside the text.
                title = line[len('TITLE:'):].strip()
                current_section = 'title'
            elif line.startswith('EXCERPT:'):
                excerpt = line[len('EXCERPT:'):].strip()
                current_section = 'excerpt'
            elif line.startswith('CONTENT:'):
                current_section = 'content'
            elif line.startswith('SOURCES:'):
                current_section = 'sources'
            elif current_section == 'content':
                content += line + '\n'

        # Fallbacks if parsing fails
        if not title:
            title = cluster[0]['title']
        if not excerpt:
            excerpt = content[:200] + '...' if content else cluster[0]['content'][:200] + '...'
        if not content:
            content = response

        # Build source articles list
        source_articles = [
            {
                'url': article['url'],
                'title': article['title'],
                # .get(): some feeds have no author; avoid KeyError.
                'author': article.get('author'),
            }
            for article in cluster
        ]

        # Collect all images from the cluster
        all_images = []
        for article in cluster:
            if article.get('images'):
                all_images.extend(article['images'])
            elif article.get('top_image'):
                all_images.append(article['top_image'])

        # Remove duplicates (order-preserving), keep first 5
        unique_images = []
        for img in all_images:
            if img and img not in unique_images:
                unique_images.append(img)
                if len(unique_images) >= 5:
                    break

        # Collect all videos from the cluster
        all_videos = []
        for article in cluster:
            if article.get('videos'):
                all_videos.extend(article['videos'])

        # Order-preserving dedupe (list(set(...)) gave a nondeterministic
        # ordering across runs), capped at 3 videos.
        unique_videos = list(dict.fromkeys(v for v in all_videos if v))[:3]

        # Detect category
        category_hint = cluster[0].get('category_hint') or database.detect_category(title, content)

        return {
            'title': title.strip(),
            'content': content.strip(),
            'excerpt': excerpt.strip(),
            'source_articles': source_articles,
            'category_hint': category_hint,
            'featured_image': unique_images[0] if unique_images else None,
            'images': unique_images,   # all deduped images (max 5)
            'videos': unique_videos,   # all deduped videos (max 3)
        }


def run_compiler():
    """Main compiler execution; records the pipeline-stage outcome.

    Returns the list of compiled articles, or [] on failure.
    """
    logger.info("Starting compiler...")
    start_time = time.time()

    try:
        compiler = ArticleCompiler()
        compiled_articles = compiler.compile_articles()

        duration = int(time.time() - start_time)
        database.log_pipeline_stage(
            stage='compile',
            status='completed',
            articles_processed=len(compiled_articles),
            duration=duration
        )

        logger.info(f"Compiler completed in {duration}s. Articles compiled: {len(compiled_articles)}")
        return compiled_articles
    except Exception as e:
        logger.error(f"Compiler failed: {e}")
        database.log_pipeline_stage(
            stage='compile',
            status='failed',
            error_message=str(e)
        )
        return []


if __name__ == '__main__':
    # logger is already imported at module level; the duplicate
    # `from loguru import logger` that was here was redundant.
    logger.add(config.LOG_FILE, rotation="1 day")
    compiled = run_compiler()
    print(f"Compiled {len(compiled)} articles")