# Forked from minzeyaphyo/burmddit
#!/usr/bin/env python3
"""
Burmddit Quality Control System

Automatically checks article quality and takes corrective actions.
"""
|
|
|
|
import psycopg2
|
|
from dotenv import load_dotenv
|
|
import os
|
|
from loguru import logger
|
|
import re
|
|
from datetime import datetime, timedelta
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
|
|
load_dotenv()
|
|
|
|
class QualityControl:
    """Automated quality control for published articles.

    Each ``check_*`` method inspects the database for one class of problem,
    records anything it finds in ``self.issues_found``, and (where safe)
    applies a corrective action automatically. ``run_all_checks`` drives the
    whole pass and returns a summary report.
    """

    def __init__(self):
        # DATABASE_URL comes from the environment (.env loaded at module import).
        self.conn = psycopg2.connect(os.getenv('DATABASE_URL'))
        # Accumulated issue records; consumed by generate_report().
        self.issues_found = []

    def run_all_checks(self):
        """Run every quality check and return the summary report."""
        logger.info("🔍 Starting Quality Control Checks...")

        self.check_missing_images()
        self.check_translation_quality()
        self.check_content_length()
        self.check_duplicate_content()
        self.check_broken_slugs()

        return self.generate_report()

    def check_missing_images(self):
        """Flag published articles with no usable featured image and fix them."""
        logger.info("📸 Checking for missing images...")

        # Cursor as a context manager so it is closed even if the query raises.
        with self.conn.cursor() as cur:
            cur.execute("""
                SELECT id, slug, title_burmese, featured_image
                FROM articles
                WHERE status = 'published'
                AND (featured_image IS NULL OR featured_image = '' OR featured_image = '/placeholder.jpg')
            """)
            articles = cur.fetchall()

        if articles:
            logger.warning(f"Found {len(articles)} articles without images")
            self.issues_found.append({
                'type': 'missing_images',
                'count': len(articles),
                'action': 'set_placeholder',
                'articles': [{'id': a[0], 'slug': a[1]} for a in articles]
            })

            # Action: set default AI-related placeholder image
            self.fix_missing_images(articles)

    def fix_missing_images(self, articles):
        """Assign a default AI-themed image to every article in *articles*.

        *articles* is a sequence of rows whose first column is the article id.
        """
        if not articles:
            return  # nothing to fix; skip the commit and the misleading log line

        # Use a default AI-themed image URL
        default_image = 'https://images.unsplash.com/photo-1677442136019-21780ecad995?w=1200&h=630&fit=crop'
        ids = [a[0] for a in articles]

        with self.conn.cursor() as cur:
            # One batched UPDATE instead of a round trip per article.
            cur.execute("""
                UPDATE articles
                SET featured_image = %s
                WHERE id = ANY(%s)
            """, (default_image, ids))

        self.conn.commit()
        logger.info(f"✅ Fixed {len(articles)} articles with placeholder image")

    def check_translation_quality(self):
        """Flag published articles whose Burmese translation looks broken.

        Three heuristics: suspiciously short content, repeated text blocks
        (translation loops), and long untranslated runs of ASCII letters.
        Matching articles are archived.
        """
        logger.info("🔤 Checking translation quality...")

        with self.conn.cursor() as cur:
            # Check 1: Very short content (likely failed translation)
            cur.execute("""
                SELECT id, slug, title_burmese, LENGTH(content_burmese) as len
                FROM articles
                WHERE status = 'published'
                AND LENGTH(content_burmese) < 500
            """)
            short_articles = cur.fetchall()

            # Check 2: Repeated text patterns (translation loops)
            cur.execute("""
                SELECT id, slug, title_burmese, content_burmese
                FROM articles
                WHERE status = 'published'
                AND content_burmese ~ '(.{50,})\\1{2,}'
            """)
            repeated_articles = cur.fetchall()

            # Check 3: Contains untranslated English blocks
            cur.execute("""
                SELECT id, slug, title_burmese
                FROM articles
                WHERE status = 'published'
                AND content_burmese ~ '[a-zA-Z]{100,}'
            """)
            english_articles = cur.fetchall()

        problem_articles = []

        if short_articles:
            logger.warning(f"Found {len(short_articles)} articles with short content")
            problem_articles.extend([a[0] for a in short_articles])

        if repeated_articles:
            logger.warning(f"Found {len(repeated_articles)} articles with repeated text")
            problem_articles.extend([a[0] for a in repeated_articles])

        if english_articles:
            logger.warning(f"Found {len(english_articles)} articles with untranslated English")
            problem_articles.extend([a[0] for a in english_articles])

        if problem_articles:
            # An article may trip several heuristics; de-duplicate the ids.
            problem_articles = list(set(problem_articles))

            self.issues_found.append({
                'type': 'translation_quality',
                'count': len(problem_articles),
                'action': 'archive',
                'articles': problem_articles
            })

            # Action: Archive broken articles
            self.archive_broken_articles(problem_articles)

    def archive_broken_articles(self, article_ids):
        """Move all articles with the given ids to 'archived' status."""
        if not article_ids:
            return  # nothing to archive; skip the commit and the log line

        with self.conn.cursor() as cur:
            # One batched UPDATE rather than one statement per id.
            cur.execute("""
                UPDATE articles
                SET status = 'archived'
                WHERE id = ANY(%s)
            """, (list(article_ids),))

        self.conn.commit()
        logger.info(f"✅ Archived {len(article_ids)} broken articles")

    def check_content_length(self):
        """Flag published articles shorter than 600 or longer than 3000 chars."""
        logger.info("📏 Checking content length...")

        with self.conn.cursor() as cur:
            cur.execute("""
                SELECT COUNT(*)
                FROM articles
                WHERE status = 'published'
                AND (
                    LENGTH(content_burmese) < 600
                    OR LENGTH(content_burmese) > 3000
                )
            """)
            count = cur.fetchone()[0]

        if count > 0:
            logger.warning(f"Found {count} articles with length issues")
            self.issues_found.append({
                'type': 'content_length',
                'count': count,
                'action': 'review_needed'
            })

    def check_duplicate_content(self):
        """Flag Burmese titles shared by more than one published article."""
        logger.info("🔁 Checking for duplicates...")

        with self.conn.cursor() as cur:
            cur.execute("""
                SELECT title_burmese, COUNT(*) as cnt
                FROM articles
                WHERE status = 'published'
                GROUP BY title_burmese
                HAVING COUNT(*) > 1
            """)
            duplicates = cur.fetchall()

        if duplicates:
            logger.warning(f"Found {len(duplicates)} duplicate titles")
            self.issues_found.append({
                'type': 'duplicates',
                'count': len(duplicates),
                'action': 'manual_review'
            })

    def check_broken_slugs(self):
        """Flag published articles whose slug is missing, over-long, or not kebab-case."""
        logger.info("🔗 Checking slugs...")

        with self.conn.cursor() as cur:
            cur.execute("""
                SELECT id, slug
                FROM articles
                WHERE status = 'published'
                AND (
                    slug IS NULL
                    OR slug = ''
                    OR LENGTH(slug) > 200
                    OR slug ~ '[^a-z0-9-]'
                )
            """)
            broken = cur.fetchall()

        if broken:
            logger.warning(f"Found {len(broken)} articles with invalid slugs")
            self.issues_found.append({
                'type': 'broken_slugs',
                'count': len(broken),
                'action': 'regenerate_slugs'
            })

    def generate_report(self):
        """Build, log, and return a summary of all issues found this run."""
        report = {
            'timestamp': datetime.now().isoformat(),
            'total_issues': len(self.issues_found),
            'issues': self.issues_found,
            'summary': {}
        }

        # One entry per issue type (each check appends at most once per run).
        for issue in self.issues_found:
            report['summary'][issue['type']] = issue['count']

        logger.info("=" * 80)
        logger.info("📊 QUALITY CONTROL REPORT")
        logger.info("=" * 80)
        logger.info(f"Total Issues Found: {len(self.issues_found)}")

        for issue in self.issues_found:
            logger.info(f"  • {issue['type']}: {issue['count']} articles → {issue['action']}")

        logger.info("=" * 80)

        return report

    def get_article_stats(self):
        """Return counts of published/archived/draft articles and image coverage."""
        with self.conn.cursor() as cur:
            # Single table scan with FILTER clauses instead of four queries.
            cur.execute("""
                SELECT
                    COUNT(*) FILTER (WHERE status = 'published'),
                    COUNT(*) FILTER (WHERE status = 'archived'),
                    COUNT(*) FILTER (WHERE status = 'draft'),
                    COUNT(*) FILTER (WHERE status = 'published'
                                     AND featured_image IS NOT NULL
                                     AND featured_image != '')
                FROM articles
            """)
            total, archived, draft, with_images = cur.fetchone()

        return {
            'total_published': total,
            'total_archived': archived,
            'total_draft': draft,
            'with_images': with_images,
            'without_images': total - with_images
        }

    def close(self):
        """Close database connection"""
        self.conn.close()
|
|
|
|
|
|
def main():
    """Run the full quality-control pass and return the report.

    Logs article statistics before and after the checks so the effect of
    the corrective actions is visible.
    """
    qc = QualityControl()

    try:
        # Get stats before
        logger.info("📊 Statistics Before Quality Control:")
        stats_before = qc.get_article_stats()
        for key, value in stats_before.items():
            logger.info(f"  {key}: {value}")

        # Run checks
        report = qc.run_all_checks()

        # Get stats after
        logger.info("\n📊 Statistics After Quality Control:")
        stats_after = qc.get_article_stats()
        for key, value in stats_after.items():
            logger.info(f"  {key}: {value}")
    finally:
        # Always release the DB connection, even when a check raises
        # (the original leaked it on any exception).
        qc.close()

    return report
|
|
|
|
|
|
# Entry point: run the quality-control pass when executed as a script.
if __name__ == "__main__":
    main()