Initial Burmddit deployment - AI news aggregator in Burmese
This commit is contained in:
319
backend/compiler.py
Normal file
319
backend/compiler.py
Normal file
@@ -0,0 +1,319 @@
|
||||
# Article compilation module - Groups and merges related articles
|
||||
|
||||
import time
from typing import Dict, List, Optional, Tuple

import anthropic
from loguru import logger
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import config
import database
|
||||
|
||||
class ArticleCompiler:
|
||||
def __init__(self):
|
||||
self.client = anthropic.Anthropic(api_key=config.ANTHROPIC_API_KEY)
|
||||
|
||||
def compile_articles(self, num_articles: int = None) -> List[Dict]:
|
||||
"""Main compilation pipeline"""
|
||||
if num_articles is None:
|
||||
num_articles = config.PIPELINE['articles_per_day']
|
||||
|
||||
# Get unprocessed articles from database
|
||||
raw_articles = database.get_unprocessed_articles(limit=100)
|
||||
|
||||
if not raw_articles:
|
||||
logger.warning("No unprocessed articles found")
|
||||
return []
|
||||
|
||||
logger.info(f"Processing {len(raw_articles)} raw articles")
|
||||
|
||||
# Cluster similar articles
|
||||
clusters = self.cluster_articles(raw_articles, num_clusters=num_articles)
|
||||
|
||||
# Compile each cluster into one comprehensive article
|
||||
compiled_articles = []
|
||||
for i, cluster in enumerate(clusters):
|
||||
try:
|
||||
logger.info(f"Compiling cluster {i+1}/{len(clusters)} with {len(cluster)} articles")
|
||||
compiled = self.compile_cluster(cluster)
|
||||
|
||||
if compiled:
|
||||
compiled_articles.append(compiled)
|
||||
|
||||
time.sleep(1) # Rate limiting
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error compiling cluster {i+1}: {e}")
|
||||
continue
|
||||
|
||||
logger.info(f"Compiled {len(compiled_articles)} articles")
|
||||
return compiled_articles
|
||||
|
||||
def cluster_articles(self, articles: List[Dict], num_clusters: int) -> List[List[Dict]]:
|
||||
"""Cluster articles by similarity"""
|
||||
if len(articles) <= num_clusters:
|
||||
return [[article] for article in articles]
|
||||
|
||||
# Extract text for vectorization
|
||||
texts = [
|
||||
f"{article['title']} {article['content'][:500]}"
|
||||
for article in articles
|
||||
]
|
||||
|
||||
# TF-IDF vectorization
|
||||
vectorizer = TfidfVectorizer(max_features=100, stop_words='english')
|
||||
tfidf_matrix = vectorizer.fit_transform(texts)
|
||||
|
||||
# Calculate similarity matrix
|
||||
similarity_matrix = cosine_similarity(tfidf_matrix)
|
||||
|
||||
# Simple clustering: greedy approach
|
||||
# Find most similar articles and group them
|
||||
clusters = []
|
||||
used_indices = set()
|
||||
|
||||
for i in range(len(articles)):
|
||||
if i in used_indices:
|
||||
continue
|
||||
|
||||
# Find similar articles (above threshold)
|
||||
similar_indices = []
|
||||
for j in range(len(articles)):
|
||||
if j != i and j not in used_indices:
|
||||
if similarity_matrix[i][j] >= config.PIPELINE['clustering_threshold']:
|
||||
similar_indices.append(j)
|
||||
|
||||
# Create cluster
|
||||
cluster = [articles[i]]
|
||||
for idx in similar_indices[:config.PIPELINE['sources_per_article']-1]: # Limit cluster size
|
||||
cluster.append(articles[idx])
|
||||
used_indices.add(idx)
|
||||
|
||||
clusters.append(cluster)
|
||||
used_indices.add(i)
|
||||
|
||||
if len(clusters) >= num_clusters:
|
||||
break
|
||||
|
||||
# If we don't have enough clusters, add remaining articles individually
|
||||
while len(clusters) < num_clusters and len(used_indices) < len(articles):
|
||||
for i, article in enumerate(articles):
|
||||
if i not in used_indices:
|
||||
clusters.append([article])
|
||||
used_indices.add(i)
|
||||
break
|
||||
|
||||
logger.info(f"Created {len(clusters)} clusters from {len(articles)} articles")
|
||||
return clusters
|
||||
|
||||
def compile_cluster(self, cluster: List[Dict]) -> Optional[Dict]:
|
||||
"""Compile multiple articles into one comprehensive piece"""
|
||||
if not cluster:
|
||||
return None
|
||||
|
||||
# If only one article, use it directly (with some enhancement)
|
||||
if len(cluster) == 1:
|
||||
return self.enhance_single_article(cluster[0])
|
||||
|
||||
# Prepare source summaries
|
||||
sources_text = ""
|
||||
for i, article in enumerate(cluster, 1):
|
||||
sources_text += f"\n\n## Source {i}: {article['title']}\n"
|
||||
sources_text += f"URL: {article['url']}\n"
|
||||
sources_text += f"Content: {article['content'][:1000]}...\n" # First 1000 chars
|
||||
|
||||
# Use Claude to compile articles
|
||||
prompt = f"""You are a friendly tech blogger writing for everyday people who are curious about AI but not tech experts. Compile these {len(cluster)} related AI articles into ONE easy-to-read, engaging article.
|
||||
|
||||
{sources_text}
|
||||
|
||||
🎯 CRITICAL REQUIREMENTS:
|
||||
|
||||
WRITING STYLE:
|
||||
1. Write in SIMPLE, CASUAL language - like explaining to a friend
|
||||
2. Use SHORT SENTENCES - easy to scan on mobile
|
||||
3. AVOID JARGON - or explain it simply in parentheses
|
||||
4. Use REAL-WORLD EXAMPLES and ANALOGIES
|
||||
5. Make it FUN and ENGAGING - not boring or academic
|
||||
6. Use active voice, not passive
|
||||
7. Address readers directly ("you", "we")
|
||||
|
||||
CONTENT STRUCTURE:
|
||||
1. Catchy, clear title (no clickbait, but interesting)
|
||||
2. Hook opening: "Why should I care about this?"
|
||||
3. Clear sections with descriptive subheadings
|
||||
4. Key facts highlighted with bullet points
|
||||
5. "What this means for you" sections
|
||||
6. Brief, satisfying conclusion
|
||||
|
||||
EXAMPLES TO FOLLOW:
|
||||
❌ Bad: "The implementation of advanced neural architectures facilitates..."
|
||||
✅ Good: "New AI systems use smarter brain-like networks to..."
|
||||
|
||||
❌ Bad: "Anthropomorphic large language models demonstrate emergent capabilities..."
|
||||
✅ Good: "ChatGPT-like AI is learning new tricks on its own..."
|
||||
|
||||
TARGET: Myanmar general public (will be translated to Burmese)
|
||||
LENGTH: {config.PIPELINE['min_article_length']}-{config.PIPELINE['max_article_length']} words (shorter is better!)
|
||||
|
||||
Format the output as:
|
||||
TITLE: [Engaging, clear title]
|
||||
|
||||
EXCERPT: [2-sentence casual summary that makes people want to read]
|
||||
|
||||
CONTENT:
|
||||
[Your easy-to-read article with markdown formatting]
|
||||
|
||||
SOURCES: [List of original URLs]
|
||||
"""
|
||||
|
||||
try:
|
||||
message = self.client.messages.create(
|
||||
model=config.TRANSLATION['model'],
|
||||
max_tokens=config.TRANSLATION['max_tokens'],
|
||||
temperature=0.5, # Slightly higher for creative writing
|
||||
messages=[{"role": "user", "content": prompt}]
|
||||
)
|
||||
|
||||
response = message.content[0].text
|
||||
|
||||
# Parse response
|
||||
compiled = self.parse_compiled_article(response, cluster)
|
||||
return compiled
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error compiling with Claude: {e}")
|
||||
return None
|
||||
|
||||
def enhance_single_article(self, article: Dict) -> Dict:
|
||||
"""Enhance a single article (format, clean up, add structure)"""
|
||||
return {
|
||||
'title': article['title'],
|
||||
'content': article['content'],
|
||||
'excerpt': article['content'][:200] + '...',
|
||||
'source_articles': [
|
||||
{
|
||||
'url': article['url'],
|
||||
'title': article['title'],
|
||||
'author': article['author']
|
||||
}
|
||||
],
|
||||
'category_hint': article.get('category_hint'),
|
||||
'featured_image': article.get('top_image')
|
||||
}
|
||||
|
||||
def parse_compiled_article(self, response: str, cluster: List[Dict]) -> Dict:
|
||||
"""Parse Claude's response into structured article"""
|
||||
lines = response.strip().split('\n')
|
||||
|
||||
title = ""
|
||||
excerpt = ""
|
||||
content = ""
|
||||
|
||||
current_section = None
|
||||
|
||||
for line in lines:
|
||||
if line.startswith('TITLE:'):
|
||||
title = line.replace('TITLE:', '').strip()
|
||||
current_section = 'title'
|
||||
elif line.startswith('EXCERPT:'):
|
||||
excerpt = line.replace('EXCERPT:', '').strip()
|
||||
current_section = 'excerpt'
|
||||
elif line.startswith('CONTENT:'):
|
||||
current_section = 'content'
|
||||
elif line.startswith('SOURCES:'):
|
||||
current_section = 'sources'
|
||||
elif current_section == 'content':
|
||||
content += line + '\n'
|
||||
|
||||
# Fallback if parsing fails
|
||||
if not title:
|
||||
title = cluster[0]['title']
|
||||
if not excerpt:
|
||||
excerpt = content[:200] + '...' if content else cluster[0]['content'][:200] + '...'
|
||||
if not content:
|
||||
content = response
|
||||
|
||||
# Build source articles list
|
||||
source_articles = [
|
||||
{
|
||||
'url': article['url'],
|
||||
'title': article['title'],
|
||||
'author': article['author']
|
||||
}
|
||||
for article in cluster
|
||||
]
|
||||
|
||||
# Collect all images from cluster
|
||||
all_images = []
|
||||
for article in cluster:
|
||||
if article.get('images'):
|
||||
all_images.extend(article['images'])
|
||||
elif article.get('top_image'):
|
||||
all_images.append(article['top_image'])
|
||||
|
||||
# Remove duplicates, keep first 5
|
||||
unique_images = []
|
||||
for img in all_images:
|
||||
if img and img not in unique_images:
|
||||
unique_images.append(img)
|
||||
if len(unique_images) >= 5:
|
||||
break
|
||||
|
||||
# Collect all videos from cluster
|
||||
all_videos = []
|
||||
for article in cluster:
|
||||
if article.get('videos'):
|
||||
all_videos.extend(article['videos'])
|
||||
|
||||
# Remove duplicates
|
||||
unique_videos = list(set([v for v in all_videos if v]))[:3] # Max 3 videos
|
||||
|
||||
# Detect category
|
||||
category_hint = cluster[0].get('category_hint') or database.detect_category(title, content)
|
||||
|
||||
return {
|
||||
'title': title.strip(),
|
||||
'content': content.strip(),
|
||||
'excerpt': excerpt.strip(),
|
||||
'source_articles': source_articles,
|
||||
'category_hint': category_hint,
|
||||
'featured_image': unique_images[0] if unique_images else None,
|
||||
'images': unique_images, # 🔥 All images
|
||||
'videos': unique_videos # 🔥 All videos
|
||||
}
|
||||
|
||||
def run_compiler():
    """Run the full compilation pipeline once and record the outcome.

    Logs a 'compile' pipeline stage to the database on both success and
    failure; returns the compiled articles (empty list on failure).
    """
    logger.info("Starting compiler...")
    started = time.time()

    try:
        articles = ArticleCompiler().compile_articles()

        elapsed = int(time.time() - started)
        database.log_pipeline_stage(
            stage='compile',
            status='completed',
            articles_processed=len(articles),
            duration=elapsed
        )

        logger.info(f"Compiler completed in {elapsed}s. Articles compiled: {len(articles)}")
        return articles

    except Exception as e:
        logger.error(f"Compiler failed: {e}")
        database.log_pipeline_stage(
            stage='compile',
            status='failed',
            error_message=str(e)
        )
        return []
|
||||
|
||||
if __name__ == '__main__':
    # logger is already imported at module scope (the duplicate local
    # `from loguru import logger` was removed); just attach the file sink.
    logger.add(config.LOG_FILE, rotation="1 day")
    compiled = run_compiler()
    print(f"Compiled {len(compiled)} articles")
|
||||
Reference in New Issue
Block a user