#!/usr/bin/env python3
"""
Burmddit Quality Control System

Automatically checks article quality and takes corrective actions.
"""

import os
import re
from datetime import datetime, timedelta

import psycopg2
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from loguru import logger

load_dotenv()

# NOTE(review): the symbol prefixes in the log messages below look like
# mis-encoded emoji (mojibake). They are reproduced unchanged because the
# original characters cannot be recovered with certainty -- confirm the
# intended glyphs and the file's encoding.


class QualityControl:
    """Run quality checks against the ``articles`` table and auto-correct some.

    Every check records what it found in ``self.issues_found`` as a dict with
    at least the keys ``type``, ``count`` and ``action``. Two checks also
    modify data: missing images are replaced with a default image, and
    articles failing the translation check are set to ``status = 'archived'``.
    """

    def __init__(self):
        """Connect to the database named by the ``DATABASE_URL`` env variable."""
        self.conn = psycopg2.connect(os.getenv('DATABASE_URL'))
        self.issues_found = []

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _fetch_all(self, query):
        """Execute a read-only *query* and return all rows.

        The cursor is used as a context manager so it is closed even when
        the query raises.
        """
        with self.conn.cursor() as cur:
            cur.execute(query)
            return cur.fetchall()

    def _execute_many(self, query, param_rows):
        """Run *query* once per parameter tuple in *param_rows*, then commit.

        On any failure the transaction is rolled back before the exception
        is re-raised, so the connection stays usable for later statements.
        """
        try:
            with self.conn.cursor() as cur:
                cur.executemany(query, param_rows)
            self.conn.commit()
        except Exception:
            self.conn.rollback()
            raise

    # ------------------------------------------------------------------
    # Checks
    # ------------------------------------------------------------------

    def run_all_checks(self):
        """Run all quality checks and return the report from generate_report()."""
        logger.info("šŸ” Starting Quality Control Checks...")

        # Start from a fresh list so a second run on the same instance does
        # not report issues left over from the previous run. Rebinding
        # (rather than clearing) leaves earlier reports untouched.
        self.issues_found = []

        self.check_missing_images()
        self.check_translation_quality()
        self.check_content_length()
        self.check_duplicate_content()
        self.check_broken_slugs()

        return self.generate_report()

    def check_missing_images(self):
        """Find published articles without a real image and assign a default."""
        logger.info("šŸ“ø Checking for missing images...")

        articles = self._fetch_all("""
            SELECT id, slug, title_burmese, featured_image
            FROM articles
            WHERE status = 'published'
              AND (featured_image IS NULL
                   OR featured_image = ''
                   OR featured_image = '/placeholder.jpg')
        """)
        if not articles:
            return

        logger.warning(f"Found {len(articles)} articles without images")
        self.issues_found.append({
            'type': 'missing_images',
            'count': len(articles),
            'action': 'set_placeholder',
            'articles': [{'id': a[0], 'slug': a[1]} for a in articles],
        })

        # Action: set the default AI-related placeholder image.
        self.fix_missing_images(articles)

    def fix_missing_images(self, articles):
        """Set the default featured image on every article in *articles*.

        *articles* is a sequence of rows whose first element is the article
        id. All updates are committed together; on failure they are rolled
        back and the exception propagates.
        """
        # Default AI-themed image URL.
        default_image = 'https://images.unsplash.com/photo-1677442136019-21780ecad995?w=1200&h=630&fit=crop'

        self._execute_many(
            "UPDATE articles SET featured_image = %s WHERE id = %s",
            [(default_image, article[0]) for article in articles],
        )
        logger.info(f"āœ… Fixed {len(articles)} articles with placeholder image")

    def check_translation_quality(self):
        """Detect likely translation failures and archive the affected articles.

        Three independent checks are run on published articles: very short
        (or missing) content, long repeated text patterns, and long runs of
        ASCII letters. Only ids are fetched because nothing else is used.
        """
        logger.info("šŸ”¤ Checking translation quality...")

        # Check 1: very short content (likely failed translation).
        # COALESCE makes NULL content count as length 0; a bare
        # LENGTH(NULL) < 500 evaluates to NULL and would skip the row.
        short_articles = self._fetch_all("""
            SELECT id
            FROM articles
            WHERE status = 'published'
              AND COALESCE(LENGTH(content_burmese), 0) < 500
        """)

        # Check 2: repeated text patterns (translation loops).
        repeated_articles = self._fetch_all("""
            SELECT id
            FROM articles
            WHERE status = 'published'
              AND content_burmese ~ '(.{50,})\\1{2,}'
        """)

        # Check 3: contains untranslated English blocks.
        english_articles = self._fetch_all("""
            SELECT id
            FROM articles
            WHERE status = 'published'
              AND content_burmese ~ '[a-zA-Z]{100,}'
        """)

        problem_articles = []

        if short_articles:
            logger.warning(f"Found {len(short_articles)} articles with short content")
            problem_articles.extend(row[0] for row in short_articles)

        if repeated_articles:
            logger.warning(f"Found {len(repeated_articles)} articles with repeated text")
            problem_articles.extend(row[0] for row in repeated_articles)

        if english_articles:
            logger.warning(f"Found {len(english_articles)} articles with untranslated English")
            problem_articles.extend(row[0] for row in english_articles)

        if not problem_articles:
            return

        # Remove duplicates while keeping a deterministic, first-seen order.
        problem_articles = list(dict.fromkeys(problem_articles))

        self.issues_found.append({
            'type': 'translation_quality',
            'count': len(problem_articles),
            'action': 'archive',
            'articles': problem_articles,
        })

        # Action: archive broken articles.
        self.archive_broken_articles(problem_articles)

    def archive_broken_articles(self, article_ids):
        """Set ``status = 'archived'`` for every id in *article_ids*.

        All updates are committed together; on failure they are rolled back
        and the exception propagates.
        """
        self._execute_many(
            "UPDATE articles SET status = 'archived' WHERE id = %s",
            [(article_id,) for article_id in article_ids],
        )
        logger.info(f"āœ… Archived {len(article_ids)} broken articles")

    def check_content_length(self):
        """Count published articles whose content length is outside 600-3000."""
        logger.info("šŸ“ Checking content length...")

        # NULL content is treated as length 0 so it is reported as too short.
        rows = self._fetch_all("""
            SELECT COUNT(*)
            FROM articles
            WHERE status = 'published'
              AND (COALESCE(LENGTH(content_burmese), 0) < 600
                   OR LENGTH(content_burmese) > 3000)
        """)
        count = rows[0][0]

        if count > 0:
            logger.warning(f"Found {count} articles with length issues")
            self.issues_found.append({
                'type': 'content_length',
                'count': count,
                'action': 'review_needed',
            })

    def check_duplicate_content(self):
        """Report Burmese titles shared by more than one published article."""
        logger.info("šŸ” Checking for duplicates...")

        duplicates = self._fetch_all("""
            SELECT title_burmese, COUNT(*) as cnt
            FROM articles
            WHERE status = 'published'
            GROUP BY title_burmese
            HAVING COUNT(*) > 1
        """)

        if duplicates:
            logger.warning(f"Found {len(duplicates)} duplicate titles")
            self.issues_found.append({
                'type': 'duplicates',
                'count': len(duplicates),
                'action': 'manual_review',
            })

    def check_broken_slugs(self):
        """Report published articles whose slug is missing or malformed.

        A slug is flagged when it is NULL, empty, longer than 200 characters,
        or contains any character other than ``a-z``, ``0-9`` or ``-``.
        """
        logger.info("šŸ”— Checking slugs...")

        broken = self._fetch_all("""
            SELECT id, slug
            FROM articles
            WHERE status = 'published'
              AND (slug IS NULL
                   OR slug = ''
                   OR LENGTH(slug) > 200
                   OR slug ~ '[^a-z0-9-]')
        """)

        if broken:
            logger.warning(f"Found {len(broken)} articles with invalid slugs")
            self.issues_found.append({
                'type': 'broken_slugs',
                'count': len(broken),
                'action': 'regenerate_slugs',
            })

    # ------------------------------------------------------------------
    # Reporting
    # ------------------------------------------------------------------

    def generate_report(self):
        """Log a summary of recorded issues and return it as a dict.

        Returns:
            A dict with ``timestamp`` (ISO-8601 string from
            ``datetime.now()``), ``total_issues``, ``issues`` (a copy of the
            recorded issue list) and ``summary`` (issue type -> count).
        """
        report = {
            'timestamp': datetime.now().isoformat(),
            'total_issues': len(self.issues_found),
            # Shallow copy so later checks cannot grow an already-returned report.
            'issues': list(self.issues_found),
            'summary': {issue['type']: issue['count'] for issue in self.issues_found},
        }

        logger.info("=" * 80)
        logger.info("šŸ“Š QUALITY CONTROL REPORT")
        logger.info("=" * 80)
        logger.info(f"Total Issues Found: {len(self.issues_found)}")

        for issue in self.issues_found:
            logger.info(f" • {issue['type']}: {issue['count']} articles → {issue['action']}")

        logger.info("=" * 80)

        return report

    def get_article_stats(self):
        """Return overall article counts as a dict.

        All counts come from one query, so they describe a single snapshot
        of the table. An article counts as "with image" only when its
        ``featured_image`` is not NULL, not empty and not
        ``/placeholder.jpg`` -- the same definition check_missing_images()
        uses for a missing image.
        """
        rows = self._fetch_all("""
            SELECT
                COUNT(*) FILTER (WHERE status = 'published'),
                COUNT(*) FILTER (WHERE status = 'archived'),
                COUNT(*) FILTER (WHERE status = 'draft'),
                COUNT(*) FILTER (WHERE status = 'published'
                                   AND featured_image IS NOT NULL
                                   AND featured_image != ''
                                   AND featured_image != '/placeholder.jpg')
            FROM articles
        """)
        total, archived, draft, with_images = rows[0]

        return {
            'total_published': total,
            'total_archived': archived,
            'total_draft': draft,
            'with_images': with_images,
            'without_images': total - with_images,
        }

    def close(self):
        """Close database connection"""
        self.conn.close()


def main():
    """Run quality control, logging statistics before and after the checks.

    Returns:
        The report dict produced by QualityControl.run_all_checks().
    """
    qc = QualityControl()
    try:
        # Get stats before
        logger.info("šŸ“Š Statistics Before Quality Control:")
        stats_before = qc.get_article_stats()
        for key, value in stats_before.items():
            logger.info(f" {key}: {value}")

        # Run checks
        report = qc.run_all_checks()

        # Get stats after
        logger.info("\nšŸ“Š Statistics After Quality Control:")
        stats_after = qc.get_article_stats()
        for key, value in stats_after.items():
            logger.info(f" {key}: {value}")
    finally:
        # Always release the connection, even when a check raises.
        qc.close()

    return report


if __name__ == "__main__":
    main()