Files
burmddit/backend/compiler.py
2026-02-19 20:10:34 +08:00

320 lines
11 KiB
Python

# Article compilation module - Groups and merges related articles
from typing import List, Dict, Tuple, Optional
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from loguru import logger
import anthropic
import config
import database
import time
class ArticleCompiler:
    """Groups related raw articles into clusters and compiles each cluster
    into a single reader-friendly article via the Anthropic API."""

    def __init__(self):
        # One client per compiler instance, reused across all API calls.
        self.client = anthropic.Anthropic(api_key=config.ANTHROPIC_API_KEY)

    def compile_articles(self, num_articles: Optional[int] = None) -> List[Dict]:
        """Main compilation pipeline.

        Args:
            num_articles: Target number of compiled articles. Defaults to
                config.PIPELINE['articles_per_day'].

        Returns:
            List of compiled article dicts (possibly empty if there is
            nothing to process or every cluster fails).
        """
        if num_articles is None:
            num_articles = config.PIPELINE['articles_per_day']

        # Get unprocessed articles from database
        raw_articles = database.get_unprocessed_articles(limit=100)
        if not raw_articles:
            logger.warning("No unprocessed articles found")
            return []

        logger.info(f"Processing {len(raw_articles)} raw articles")

        # Cluster similar articles
        clusters = self.cluster_articles(raw_articles, num_clusters=num_articles)

        # Compile each cluster into one comprehensive article
        compiled_articles = []
        for i, cluster in enumerate(clusters):
            try:
                logger.info(f"Compiling cluster {i+1}/{len(clusters)} with {len(cluster)} articles")
                compiled = self.compile_cluster(cluster)
                if compiled:
                    compiled_articles.append(compiled)
                time.sleep(1)  # Rate limiting between API calls
            except Exception as e:
                # Best-effort batch: one failing cluster must not abort the rest.
                logger.error(f"Error compiling cluster {i+1}: {e}")
                continue

        logger.info(f"Compiled {len(compiled_articles)} articles")
        return compiled_articles

    def cluster_articles(self, articles: List[Dict], num_clusters: int) -> List[List[Dict]]:
        """Cluster articles by TF-IDF cosine similarity.

        Greedy grouping: each unused article seeds a cluster and pulls in
        up to (sources_per_article - 1) similar, unused articles. If fewer
        than num_clusters form, leftover articles become singleton clusters.

        Returns:
            List of clusters, each a non-empty list of article dicts.
        """
        # Fewer articles than requested clusters: every article stands alone.
        if len(articles) <= num_clusters:
            return [[article] for article in articles]

        # Title plus a content prefix is enough signal for similarity.
        texts = [
            f"{article['title']} {article['content'][:500]}"
            for article in articles
        ]

        # TF-IDF vectorization followed by a full pairwise similarity matrix.
        vectorizer = TfidfVectorizer(max_features=100, stop_words='english')
        tfidf_matrix = vectorizer.fit_transform(texts)
        similarity_matrix = cosine_similarity(tfidf_matrix)

        # Greedy clustering: seed with the first unused article, attach
        # any unused article above the configured similarity threshold.
        clusters = []
        used_indices = set()
        threshold = config.PIPELINE['clustering_threshold']
        max_extra = config.PIPELINE['sources_per_article'] - 1  # seed already counts

        for i in range(len(articles)):
            if i in used_indices:
                continue

            similar_indices = [
                j for j in range(len(articles))
                if j != i and j not in used_indices
                and similarity_matrix[i][j] >= threshold
            ]

            cluster = [articles[i]]
            for idx in similar_indices[:max_extra]:  # cap cluster size
                cluster.append(articles[idx])
                used_indices.add(idx)
            clusters.append(cluster)
            used_indices.add(i)

            if len(clusters) >= num_clusters:
                break

        # Top up with singleton clusters in a single pass (the original
        # restarted the scan from index 0 for every added cluster: O(n^2)).
        if len(clusters) < num_clusters:
            for i, article in enumerate(articles):
                if len(clusters) >= num_clusters:
                    break
                if i not in used_indices:
                    clusters.append([article])
                    used_indices.add(i)

        logger.info(f"Created {len(clusters)} clusters from {len(articles)} articles")
        return clusters

    def compile_cluster(self, cluster: List[Dict]) -> Optional[Dict]:
        """Compile multiple articles into one comprehensive piece.

        Returns None for an empty cluster or when the API call fails;
        single-article clusters skip the API and are lightly enhanced.
        """
        if not cluster:
            return None

        # If only one article, use it directly (with some enhancement)
        if len(cluster) == 1:
            return self.enhance_single_article(cluster[0])

        # Prepare source summaries for the prompt.
        sources_text = ""
        for i, article in enumerate(cluster, 1):
            sources_text += f"\n\n## Source {i}: {article['title']}\n"
            sources_text += f"URL: {article['url']}\n"
            sources_text += f"Content: {article['content'][:1000]}...\n"  # First 1000 chars

        # Use Claude to compile articles
        prompt = f"""You are a friendly tech blogger writing for everyday people who are curious about AI but not tech experts. Compile these {len(cluster)} related AI articles into ONE easy-to-read, engaging article.
{sources_text}
🎯 CRITICAL REQUIREMENTS:
WRITING STYLE:
1. Write in SIMPLE, CASUAL language - like explaining to a friend
2. Use SHORT SENTENCES - easy to scan on mobile
3. AVOID JARGON - or explain it simply in parentheses
4. Use REAL-WORLD EXAMPLES and ANALOGIES
5. Make it FUN and ENGAGING - not boring or academic
6. Use active voice, not passive
7. Address readers directly ("you", "we")
CONTENT STRUCTURE:
1. Catchy, clear title (no clickbait, but interesting)
2. Hook opening: "Why should I care about this?"
3. Clear sections with descriptive subheadings
4. Key facts highlighted with bullet points
5. "What this means for you" sections
6. Brief, satisfying conclusion
EXAMPLES TO FOLLOW:
❌ Bad: "The implementation of advanced neural architectures facilitates..."
✅ Good: "New AI systems use smarter brain-like networks to..."
❌ Bad: "Anthropomorphic large language models demonstrate emergent capabilities..."
✅ Good: "ChatGPT-like AI is learning new tricks on its own..."
TARGET: Myanmar general public (will be translated to Burmese)
LENGTH: {config.PIPELINE['min_article_length']}-{config.PIPELINE['max_article_length']} words (shorter is better!)
Format the output as:
TITLE: [Engaging, clear title]
EXCERPT: [2-sentence casual summary that makes people want to read]
CONTENT:
[Your easy-to-read article with markdown formatting]
SOURCES: [List of original URLs]
"""
        try:
            message = self.client.messages.create(
                model=config.TRANSLATION['model'],
                max_tokens=config.TRANSLATION['max_tokens'],
                temperature=0.5,  # Slightly higher for creative writing
                messages=[{"role": "user", "content": prompt}]
            )
            response = message.content[0].text
            return self.parse_compiled_article(response, cluster)
        except Exception as e:
            logger.error(f"Error compiling with Claude: {e}")
            return None

    def enhance_single_article(self, article: Dict) -> Dict:
        """Wrap a single article in the compiled-article structure.

        No API call is made; the excerpt is the first 200 content chars.
        """
        return {
            'title': article['title'],
            'content': article['content'],
            'excerpt': article['content'][:200] + '...',
            'source_articles': [
                {
                    'url': article['url'],
                    'title': article['title'],
                    'author': article['author']
                }
            ],
            'category_hint': article.get('category_hint'),
            'featured_image': article.get('top_image')
        }

    def parse_compiled_article(self, response: str, cluster: List[Dict]) -> Dict:
        """Parse Claude's TITLE/EXCERPT/CONTENT/SOURCES response.

        Falls back to cluster data when a section is missing, merges
        media (max 5 images, 3 videos) from all cluster articles, and
        detects a category when no hint is present.
        """
        title = ""
        excerpt = ""
        content = ""
        current_section = None

        for line in response.strip().split('\n'):
            if line.startswith('TITLE:'):
                # Slice off the prefix only; str.replace would also strip
                # 'TITLE:' appearing later in the line.
                title = line[len('TITLE:'):].strip()
                current_section = 'title'
            elif line.startswith('EXCERPT:'):
                excerpt = line[len('EXCERPT:'):].strip()
                current_section = 'excerpt'
            elif line.startswith('CONTENT:'):
                current_section = 'content'
            elif line.startswith('SOURCES:'):
                current_section = 'sources'
            elif current_section == 'content':
                content += line + '\n'
            elif current_section == 'excerpt' and line.strip():
                # Excerpts may wrap onto extra lines before CONTENT:;
                # the original parser silently dropped them.
                excerpt += ' ' + line.strip()

        # Fallbacks if parsing fails.
        if not title:
            title = cluster[0]['title']
        if not excerpt:
            excerpt = content[:200] + '...' if content else cluster[0]['content'][:200] + '...'
        if not content:
            content = response

        # Build source articles list
        source_articles = [
            {
                'url': article['url'],
                'title': article['title'],
                'author': article['author']
            }
            for article in cluster
        ]

        # Collect all images from cluster (gallery first, top image as fallback).
        all_images = []
        for article in cluster:
            if article.get('images'):
                all_images.extend(article['images'])
            elif article.get('top_image'):
                all_images.append(article['top_image'])

        # Remove duplicates preserving order, keep first 5.
        unique_images = []
        for img in all_images:
            if img and img not in unique_images:
                unique_images.append(img)
                if len(unique_images) >= 5:
                    break

        # Collect all videos from cluster
        all_videos = []
        for article in cluster:
            if article.get('videos'):
                all_videos.extend(article['videos'])

        # Order-preserving dedupe (list(set(...)) gave unstable ordering), max 3.
        unique_videos = list(dict.fromkeys(v for v in all_videos if v))[:3]

        # Detect category
        category_hint = cluster[0].get('category_hint') or database.detect_category(title, content)

        return {
            'title': title.strip(),
            'content': content.strip(),
            'excerpt': excerpt.strip(),
            'source_articles': source_articles,
            'category_hint': category_hint,
            'featured_image': unique_images[0] if unique_images else None,
            'images': unique_images,   # 🔥 All images
            'videos': unique_videos    # 🔥 All videos
        }
def run_compiler():
    """Entry point for the compile stage.

    Runs the compiler, records the outcome in the pipeline log, and
    returns the compiled articles (empty list on failure).
    """
    logger.info("Starting compiler...")
    started = time.time()
    try:
        articles = ArticleCompiler().compile_articles()
        elapsed = int(time.time() - started)
        database.log_pipeline_stage(
            stage='compile',
            status='completed',
            articles_processed=len(articles),
            duration=elapsed
        )
        logger.info(f"Compiler completed in {elapsed}s. Articles compiled: {len(articles)}")
        return articles
    except Exception as exc:
        # Boundary catch: log, record the failure, and report no output.
        logger.error(f"Compiler failed: {exc}")
        database.log_pipeline_stage(
            stage='compile',
            status='failed',
            error_message=str(exc)
        )
        return []
if __name__ == '__main__':
    # `logger` is already imported at module level; the redundant
    # re-import that used to live here has been removed.
    logger.add(config.LOG_FILE, rotation="1 day")
    compiled = run_compiler()
    print(f"Compiled {len(compiled)} articles")