forked from minzeyaphyo/burmddit
✅ Trigger redeploy: Category pages + Quality control
This commit is contained in:
329
backend/quality_control.py
Normal file
329
backend/quality_control.py
Normal file
@@ -0,0 +1,329 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Burmddit Quality Control System
|
||||
Automatically checks article quality and takes corrective actions
|
||||
"""
|
||||
|
||||
import psycopg2
|
||||
from dotenv import load_dotenv
|
||||
import os
|
||||
from loguru import logger
|
||||
import re
|
||||
from datetime import datetime, timedelta
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
# Load variables from a local .env file into the process environment before
# anything reads os.getenv('DATABASE_URL').
load_dotenv()
|
||||
|
||||
class QualityControl:
    """Automated quality checks (and corrective actions) for published articles.

    Every check inspects rows of the ``articles`` table whose status is
    ``'published'`` and appends a summary dict (``type``, ``count``,
    ``action`` and, for some checks, ``articles``) to ``issues_found``.
    Two checks also modify data:

    * ``check_missing_images`` sets a default featured image.
    * ``check_translation_quality`` archives the offending articles.

    The instance owns one database connection; call ``close()`` when done,
    or use the instance as a context manager.
    """

    # Image assigned to published articles that have no usable featured image.
    DEFAULT_IMAGE = (
        'https://images.unsplash.com/photo-1677442136019-21780ecad995'
        '?w=1200&h=630&fit=crop'
    )

    def __init__(self):
        """Connect to the database named by the DATABASE_URL env variable."""
        self.conn = psycopg2.connect(os.getenv('DATABASE_URL'))
        self.issues_found = []

    def __enter__(self):
        """Return self so the instance can be used in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc, tb):
        """Close the connection on leaving the ``with`` block."""
        self.close()
        return False

    # ------------------------------------------------------------------
    # Private database helpers
    # ------------------------------------------------------------------

    def _fetch_all(self, query):
        """Run a read query and return every row.

        The connection context ends the transaction (commit on success,
        rollback on error) so a failing check cannot leave the shared
        connection in an aborted transaction; the cursor context always
        closes the cursor.
        """
        with self.conn, self.conn.cursor() as cur:
            cur.execute(query)
            return cur.fetchall()

    def _fetch_count(self, query):
        """Run a single-value query (e.g. ``SELECT COUNT(*)``) and return it."""
        with self.conn, self.conn.cursor() as cur:
            cur.execute(query)
            return cur.fetchone()[0]

    def _update_many(self, query, rows):
        """Execute *query* once per parameter tuple in *rows*, atomically.

        All updates are committed together, or rolled back together if any
        of them fails.
        """
        with self.conn, self.conn.cursor() as cur:
            cur.executemany(query, rows)

    # ------------------------------------------------------------------
    # Checks
    # ------------------------------------------------------------------

    def run_all_checks(self):
        """Run every quality check and return the report dict.

        ``issues_found`` is reset first so that calling this method more
        than once on the same instance does not report stale or duplicated
        issues from an earlier run.
        """
        logger.info("🔍 Starting Quality Control Checks...")

        # Rebind (rather than clear) so reports returned earlier keep their
        # own list of issues.
        self.issues_found = []

        self.check_missing_images()
        self.check_translation_quality()
        self.check_content_length()
        self.check_duplicate_content()
        self.check_broken_slugs()

        return self.generate_report()

    def check_missing_images(self):
        """Find published articles without an image and give them a default."""
        logger.info("📸 Checking for missing images...")

        articles = self._fetch_all("""
            SELECT id, slug, title_burmese, featured_image
            FROM articles
            WHERE status = 'published'
            AND (featured_image IS NULL OR featured_image = '' OR featured_image = '/placeholder.jpg')
        """)

        if not articles:
            return

        logger.warning(f"Found {len(articles)} articles without images")
        self.issues_found.append({
            'type': 'missing_images',
            'count': len(articles),
            'action': 'set_placeholder',
            'articles': [{'id': row[0], 'slug': row[1]} for row in articles],
        })

        # Action: set the default AI-related placeholder image.
        self.fix_missing_images(articles)

    def fix_missing_images(self, articles):
        """Set the default featured image on the given articles.

        Args:
            articles: sequence of rows whose first element is the article id.
        """
        if not articles:
            return

        self._update_many(
            """
            UPDATE articles
            SET featured_image = %s
            WHERE id = %s
            """,
            [(self.DEFAULT_IMAGE, row[0]) for row in articles],
        )
        logger.info(f"✅ Fixed {len(articles)} articles with placeholder image")

    def check_translation_quality(self):
        """Detect likely-broken translations and archive them.

        Three heuristics are applied to published articles:

        1. Very short content (under 500 characters, including NULL content).
        2. A chunk of 50+ characters repeated three or more times in a row
           (translation loop).
        3. A run of 100+ characters made only of Latin letters, whitespace
           and basic punctuation (an untranslated English block).
        """
        logger.info("🔤 Checking translation quality...")

        checks = (
            # COALESCE: LENGTH(NULL) is NULL, which would otherwise let
            # articles with no content at all slip through.
            ("short content", """
                SELECT id
                FROM articles
                WHERE status = 'published'
                AND COALESCE(LENGTH(content_burmese), 0) < 500
            """),
            ("repeated text", """
                SELECT id
                FROM articles
                WHERE status = 'published'
                AND content_burmese ~ '(.{50,})\\1{2,}'
            """),
            # The run must be allowed to contain spaces and punctuation;
            # letters alone ([a-zA-Z]{100,}) never match real English prose.
            ("untranslated English", """
                SELECT id
                FROM articles
                WHERE status = 'published'
                AND content_burmese ~ '[a-zA-Z][a-zA-Z[:space:],.;:!?''-]{99,}'
            """),
        )

        problem_ids = []
        for label, query in checks:
            rows = self._fetch_all(query)
            if rows:
                logger.warning(f"Found {len(rows)} articles with {label}")
                problem_ids.extend(row[0] for row in rows)

        if not problem_ids:
            return

        # An article can fail several heuristics; keep each id once, in
        # first-seen order.
        problem_ids = list(dict.fromkeys(problem_ids))

        self.issues_found.append({
            'type': 'translation_quality',
            'count': len(problem_ids),
            'action': 'archive',
            'articles': problem_ids,
        })

        # Action: archive broken articles.
        self.archive_broken_articles(problem_ids)

    def archive_broken_articles(self, article_ids):
        """Set status to ``'archived'`` for every id in *article_ids*."""
        if not article_ids:
            return

        self._update_many(
            """
            UPDATE articles
            SET status = 'archived'
            WHERE id = %s
            """,
            [(article_id,) for article_id in article_ids],
        )
        logger.info(f"✅ Archived {len(article_ids)} broken articles")

    def check_content_length(self):
        """Count published articles shorter than 600 or longer than 3000 chars.

        NULL content is treated as length 0. Report-only: nothing is modified.
        """
        logger.info("📏 Checking content length...")

        count = self._fetch_count("""
            SELECT COUNT(*)
            FROM articles
            WHERE status = 'published'
            AND (
                COALESCE(LENGTH(content_burmese), 0) < 600
                OR LENGTH(content_burmese) > 3000
            )
        """)

        if count > 0:
            logger.warning(f"Found {count} articles with length issues")
            self.issues_found.append({
                'type': 'content_length',
                'count': count,
                'action': 'review_needed',
            })

    def check_duplicate_content(self):
        """Report Burmese titles shared by more than one published article."""
        logger.info("🔁 Checking for duplicates...")

        duplicates = self._fetch_all("""
            SELECT title_burmese, COUNT(*) as cnt
            FROM articles
            WHERE status = 'published'
            GROUP BY title_burmese
            HAVING COUNT(*) > 1
        """)

        if duplicates:
            logger.warning(f"Found {len(duplicates)} duplicate titles")
            self.issues_found.append({
                'type': 'duplicates',
                'count': len(duplicates),
                'action': 'manual_review',
            })

    def check_broken_slugs(self):
        """Report published articles whose slug is missing or malformed.

        A slug is invalid when it is NULL, empty, longer than 200 characters,
        or contains anything other than lowercase letters, digits and '-'.
        """
        logger.info("🔗 Checking slugs...")

        broken = self._fetch_all("""
            SELECT id, slug
            FROM articles
            WHERE status = 'published'
            AND (
                slug IS NULL
                OR slug = ''
                OR LENGTH(slug) > 200
                OR slug ~ '[^a-z0-9-]'
            )
        """)

        if broken:
            logger.warning(f"Found {len(broken)} articles with invalid slugs")
            self.issues_found.append({
                'type': 'broken_slugs',
                'count': len(broken),
                'action': 'regenerate_slugs',
            })

    # ------------------------------------------------------------------
    # Reporting
    # ------------------------------------------------------------------

    def generate_report(self):
        """Log a summary of ``issues_found`` and return it as a dict.

        Returns:
            dict with keys ``timestamp`` (ISO-8601 local time),
            ``total_issues``, ``issues`` (the ``issues_found`` list) and
            ``summary`` (issue type -> affected count).
        """
        report = {
            'timestamp': datetime.now().isoformat(),
            'total_issues': len(self.issues_found),
            'issues': self.issues_found,
            'summary': {
                issue['type']: issue['count'] for issue in self.issues_found
            },
        }

        logger.info("=" * 80)
        logger.info("📊 QUALITY CONTROL REPORT")
        logger.info("=" * 80)
        logger.info(f"Total Issues Found: {len(self.issues_found)}")

        for issue in self.issues_found:
            logger.info(f" • {issue['type']}: {issue['count']} articles → {issue['action']}")

        logger.info("=" * 80)

        return report

    def get_article_stats(self):
        """Return counts of articles by status and by image presence.

        ``with_images`` excludes NULL, empty and '/placeholder.jpg' values so
        that it agrees with what ``check_missing_images`` treats as missing.
        """
        total = self._fetch_count(
            "SELECT COUNT(*) FROM articles WHERE status = 'published'"
        )
        archived = self._fetch_count(
            "SELECT COUNT(*) FROM articles WHERE status = 'archived'"
        )
        draft = self._fetch_count(
            "SELECT COUNT(*) FROM articles WHERE status = 'draft'"
        )
        with_images = self._fetch_count("""
            SELECT COUNT(*) FROM articles
            WHERE status = 'published'
            AND featured_image IS NOT NULL
            AND featured_image != ''
            AND featured_image != '/placeholder.jpg'
        """)

        return {
            'total_published': total,
            'total_archived': archived,
            'total_draft': draft,
            'with_images': with_images,
            'without_images': total - with_images,
        }

    def close(self):
        """Close database connection"""
        self.conn.close()
|
||||
|
||||
|
||||
def main():
    """Run quality control and log article statistics before and after.

    Returns:
        The report dict produced by ``QualityControl.run_all_checks()``.

    The database connection is closed even if a check or a statistics query
    raises, so a failed run does not leak the connection.
    """
    qc = QualityControl()

    try:
        # Get stats before
        logger.info("📊 Statistics Before Quality Control:")
        for key, value in qc.get_article_stats().items():
            logger.info(f" {key}: {value}")

        # Run checks
        report = qc.run_all_checks()

        # Get stats after
        logger.info("\n📊 Statistics After Quality Control:")
        for key, value in qc.get_article_stats().items():
            logger.info(f" {key}: {value}")
    finally:
        qc.close()

    return report
|
||||
|
||||
|
||||
# Run the quality-control pass only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user