forked from minzeyaphyo/burmddit
Frontend changes: - Add /admin dashboard for article management - Add AdminButton component (Alt+Shift+A on articles) - Add /api/admin/article API endpoints Backend improvements: - scraper_v2.py: Multi-layer fallback extraction (newspaper → trafilatura → readability) - translator_v2.py: Better chunking, repetition detection, validation - admin_tools.py: CLI admin commands - test_scraper.py: Individual source testing Docs: - WEB-ADMIN-GUIDE.md: Web admin usage - ADMIN-GUIDE.md: CLI admin usage - SCRAPER-IMPROVEMENT-PLAN.md: Scraper fixes details - TRANSLATION-FIX.md: Translation improvements - ADMIN-FEATURES-SUMMARY.md: Implementation summary Fixes: - Article scraping from 0 → 96+ articles working - Translation quality issues (repetition, truncation) - Added 13 new RSS sources
353 lines
14 KiB
Python
353 lines
14 KiB
Python
# Improved Burmese translation module with better error handling
|
|
|
|
from typing import Dict, Optional
|
|
from loguru import logger
|
|
import anthropic
|
|
import re
|
|
import config
|
|
import time
|
|
|
|
class BurmeseTranslator:
|
|
def __init__(self):
|
|
self.client = anthropic.Anthropic(api_key=config.ANTHROPIC_API_KEY)
|
|
self.preserve_terms = config.TRANSLATION['preserve_terms']
|
|
|
|
def translate_article(self, article: Dict) -> Dict:
|
|
"""Translate compiled article to Burmese"""
|
|
logger.info(f"Translating article: {article['title'][:50]}...")
|
|
|
|
try:
|
|
# Translate title
|
|
title_burmese = self.translate_text(
|
|
text=article['title'],
|
|
context="This is an article title about AI technology",
|
|
max_length=200
|
|
)
|
|
|
|
# Translate excerpt
|
|
excerpt_burmese = self.translate_text(
|
|
text=article['excerpt'],
|
|
context="This is a brief article summary",
|
|
max_length=300
|
|
)
|
|
|
|
# Translate main content with improved chunking
|
|
content_burmese = self.translate_long_text(
|
|
article['content'],
|
|
chunk_size=1200 # Reduced from 2000 for safety
|
|
)
|
|
|
|
# Validate translation quality
|
|
if not self.validate_translation(content_burmese, article['content']):
|
|
logger.warning(f"Translation validation failed, using fallback")
|
|
# Try again with smaller chunks
|
|
content_burmese = self.translate_long_text(
|
|
article['content'],
|
|
chunk_size=800 # Even smaller
|
|
)
|
|
|
|
# Return article with Burmese translations
|
|
return {
|
|
**article,
|
|
'title_burmese': title_burmese,
|
|
'excerpt_burmese': excerpt_burmese,
|
|
'content_burmese': content_burmese
|
|
}
|
|
|
|
except Exception as e:
|
|
logger.error(f"Translation error: {e}")
|
|
# Fallback: return original text if translation fails
|
|
return {
|
|
**article,
|
|
'title_burmese': article['title'],
|
|
'excerpt_burmese': article['excerpt'],
|
|
'content_burmese': article['content']
|
|
}
|
|
|
|
def translate_text(self, text: str, context: str = "", max_length: int = None) -> str:
|
|
"""Translate a text block to Burmese with improved prompting"""
|
|
|
|
# Build preserved terms list
|
|
preserved_terms_str = ", ".join(self.preserve_terms)
|
|
|
|
# Add length guidance if specified
|
|
length_guidance = ""
|
|
if max_length:
|
|
length_guidance = f"\n⚠️ IMPORTANT: Keep translation under {max_length} words. Be concise."
|
|
|
|
prompt = f"""Translate the following English text to Burmese (Myanmar Unicode) in a CASUAL, EASY-TO-READ style.
|
|
|
|
🎯 CRITICAL GUIDELINES:
|
|
1. Write in **CASUAL, CONVERSATIONAL Burmese** - like talking to a friend
|
|
2. Use **SIMPLE, EVERYDAY words** - avoid formal or academic language
|
|
3. Explain technical concepts in **LAYMAN TERMS**
|
|
4. Keep these terms in English: {preserved_terms_str}
|
|
5. Add **brief explanations** in parentheses for complex terms
|
|
6. Use **short sentences** - easy to read on mobile
|
|
7. Break up long paragraphs - white space is good
|
|
8. Keep markdown formatting (##, **, -, etc.) intact{length_guidance}
|
|
|
|
🚫 CRITICAL: DO NOT REPEAT TEXT OR GET STUCK IN LOOPS!
|
|
- If you start repeating, STOP immediately
|
|
- Translate fully but concisely
|
|
- Each sentence should be unique
|
|
|
|
TARGET AUDIENCE: General Myanmar public curious about AI
|
|
|
|
Context: {context}
|
|
|
|
Text to translate:
|
|
{text}
|
|
|
|
Burmese translation (natural, concise, no repetitions):"""
|
|
|
|
try:
|
|
message = self.client.messages.create(
|
|
model=config.TRANSLATION['model'],
|
|
max_tokens=min(config.TRANSLATION['max_tokens'], 3000), # Cap at 3000
|
|
temperature=config.TRANSLATION['temperature'],
|
|
messages=[{"role": "user", "content": prompt}]
|
|
)
|
|
|
|
translated = message.content[0].text.strip()
|
|
|
|
# Post-process and validate
|
|
translated = self.post_process_translation(translated)
|
|
|
|
# Check for hallucination/loops
|
|
if self.detect_repetition(translated):
|
|
logger.warning("Detected repetitive text, retrying with lower temperature")
|
|
# Retry with lower temperature
|
|
message = self.client.messages.create(
|
|
model=config.TRANSLATION['model'],
|
|
max_tokens=min(config.TRANSLATION['max_tokens'], 3000),
|
|
temperature=0.3, # Lower temperature
|
|
messages=[{"role": "user", "content": prompt}]
|
|
)
|
|
translated = message.content[0].text.strip()
|
|
translated = self.post_process_translation(translated)
|
|
|
|
return translated
|
|
|
|
except Exception as e:
|
|
logger.error(f"API translation error: {e}")
|
|
return text # Fallback to original
|
|
|
|
def translate_long_text(self, text: str, chunk_size: int = 1200) -> str:
|
|
"""Translate long text in chunks with better error handling"""
|
|
|
|
# If text is short enough, translate directly
|
|
if len(text) < chunk_size:
|
|
return self.translate_text(text, context="This is the main article content")
|
|
|
|
logger.info(f"Article is {len(text)} chars, splitting into chunks...")
|
|
|
|
# Split into paragraphs first
|
|
paragraphs = text.split('\n\n')
|
|
|
|
# Group paragraphs into chunks (more conservative sizing)
|
|
chunks = []
|
|
current_chunk = ""
|
|
|
|
for para in paragraphs:
|
|
# Check if adding this paragraph would exceed chunk size
|
|
if len(current_chunk) + len(para) + 4 < chunk_size: # +4 for \n\n
|
|
if current_chunk:
|
|
current_chunk += '\n\n' + para
|
|
else:
|
|
current_chunk = para
|
|
else:
|
|
# Current chunk is full, save it
|
|
if current_chunk:
|
|
chunks.append(current_chunk.strip())
|
|
|
|
# Start new chunk with this paragraph
|
|
# If paragraph itself is too long, split it further
|
|
if len(para) > chunk_size:
|
|
# Split long paragraph by sentences
|
|
sentences = para.split('. ')
|
|
temp_chunk = ""
|
|
for sent in sentences:
|
|
if len(temp_chunk) + len(sent) + 2 < chunk_size:
|
|
temp_chunk += sent + '. '
|
|
else:
|
|
if temp_chunk:
|
|
chunks.append(temp_chunk.strip())
|
|
temp_chunk = sent + '. '
|
|
current_chunk = temp_chunk
|
|
else:
|
|
current_chunk = para
|
|
|
|
# Don't forget the last chunk
|
|
if current_chunk:
|
|
chunks.append(current_chunk.strip())
|
|
|
|
logger.info(f"Split into {len(chunks)} chunks (avg {len(text)//len(chunks)} chars each)")
|
|
|
|
# Translate each chunk with progress tracking
|
|
translated_chunks = []
|
|
failed_chunks = 0
|
|
|
|
for i, chunk in enumerate(chunks):
|
|
logger.info(f"Translating chunk {i+1}/{len(chunks)} ({len(chunk)} chars)...")
|
|
|
|
try:
|
|
translated = self.translate_text(
|
|
chunk,
|
|
context=f"This is part {i+1} of {len(chunks)} of a longer article"
|
|
)
|
|
|
|
# Validate chunk translation
|
|
if self.detect_repetition(translated):
|
|
logger.warning(f"Chunk {i+1} has repetition, retrying...")
|
|
time.sleep(1)
|
|
translated = self.translate_text(
|
|
chunk,
|
|
context=f"This is part {i+1} of {len(chunks)} - translate fully without repetition"
|
|
)
|
|
|
|
translated_chunks.append(translated)
|
|
time.sleep(0.5) # Rate limiting
|
|
|
|
except Exception as e:
|
|
logger.error(f"Failed to translate chunk {i+1}: {e}")
|
|
failed_chunks += 1
|
|
# Use original text as fallback for this chunk
|
|
translated_chunks.append(chunk)
|
|
time.sleep(1)
|
|
|
|
if failed_chunks > 0:
|
|
logger.warning(f"{failed_chunks}/{len(chunks)} chunks failed translation")
|
|
|
|
# Join chunks
|
|
result = '\n\n'.join(translated_chunks)
|
|
logger.info(f"Translation complete: {len(result)} chars (original: {len(text)} chars)")
|
|
|
|
return result
|
|
|
|
def detect_repetition(self, text: str, threshold: int = 5) -> bool:
|
|
"""Detect if text has repetitive patterns (hallucination)"""
|
|
if len(text) < 100:
|
|
return False
|
|
|
|
# Check for repeated phrases (5+ words)
|
|
words = text.split()
|
|
if len(words) < 10:
|
|
return False
|
|
|
|
# Look for 5-word sequences that appear multiple times
|
|
sequences = {}
|
|
for i in range(len(words) - 4):
|
|
seq = ' '.join(words[i:i+5])
|
|
sequences[seq] = sequences.get(seq, 0) + 1
|
|
|
|
# If any sequence appears 3+ times, it's likely repetition
|
|
max_repetitions = max(sequences.values()) if sequences else 0
|
|
|
|
if max_repetitions >= threshold:
|
|
logger.warning(f"Detected repetition: {max_repetitions} occurrences")
|
|
return True
|
|
|
|
return False
|
|
|
|
def validate_translation(self, translated: str, original: str) -> bool:
|
|
"""Validate translation quality"""
|
|
|
|
# Check 1: Not empty
|
|
if not translated or len(translated) < 50:
|
|
logger.warning("Translation too short")
|
|
return False
|
|
|
|
# Check 2: Has Burmese Unicode
|
|
if not self.validate_burmese_text(translated):
|
|
logger.warning("Translation missing Burmese text")
|
|
return False
|
|
|
|
# Check 3: Reasonable length ratio (translated should be 50-200% of original)
|
|
ratio = len(translated) / len(original)
|
|
if ratio < 0.3 or ratio > 3.0:
|
|
logger.warning(f"Translation length ratio suspicious: {ratio:.2f}")
|
|
return False
|
|
|
|
# Check 4: No repetition
|
|
if self.detect_repetition(translated):
|
|
logger.warning("Translation has repetitive patterns")
|
|
return False
|
|
|
|
return True
|
|
|
|
def post_process_translation(self, text: str) -> str:
|
|
"""Clean up and validate translation"""
|
|
|
|
# Remove excessive newlines
|
|
text = re.sub(r'(\n{3,})', '\n\n', text)
|
|
|
|
# Remove leading/trailing whitespace from each line
|
|
lines = [line.strip() for line in text.split('\n')]
|
|
text = '\n'.join(lines)
|
|
|
|
# Ensure proper spacing after Burmese punctuation
|
|
text = re.sub(r'([။၊])([^\s])', r'\1 \2', text)
|
|
|
|
# Remove any accidental English remnants that shouldn't be there
|
|
# (but preserve the terms we want to keep)
|
|
|
|
return text.strip()
|
|
|
|
def validate_burmese_text(self, text: str) -> bool:
|
|
"""Check if text contains valid Burmese Unicode"""
|
|
# Myanmar Unicode range: U+1000 to U+109F
|
|
burmese_pattern = re.compile(r'[\u1000-\u109F]')
|
|
return bool(burmese_pattern.search(text))
|
|
|
|
def run_translator(compiled_articles: list) -> list:
|
|
"""Translate compiled articles to Burmese"""
|
|
logger.info(f"Starting translator for {len(compiled_articles)} articles...")
|
|
start_time = time.time()
|
|
|
|
try:
|
|
translator = BurmeseTranslator()
|
|
translated_articles = []
|
|
|
|
for i, article in enumerate(compiled_articles, 1):
|
|
logger.info(f"Translating article {i}/{len(compiled_articles)}")
|
|
|
|
try:
|
|
translated_article = translator.translate_article(article)
|
|
translated_articles.append(translated_article)
|
|
logger.info(f"✓ Translation successful for article {i}")
|
|
|
|
except Exception as e:
|
|
logger.error(f"Failed to translate article {i}: {e}")
|
|
# Add article with original English text as fallback
|
|
translated_articles.append({
|
|
**article,
|
|
'title_burmese': article['title'],
|
|
'excerpt_burmese': article['excerpt'],
|
|
'content_burmese': article['content']
|
|
})
|
|
|
|
duration = int(time.time() - start_time)
|
|
logger.info(f"Translator completed in {duration}s. Articles translated: {len(translated_articles)}")
|
|
|
|
return translated_articles
|
|
|
|
except Exception as e:
|
|
logger.error(f"Translator failed: {e}")
|
|
return compiled_articles # Return originals as fallback
|
|
|
|
if __name__ == '__main__':
|
|
# Test the translator
|
|
test_article = {
|
|
'title': 'Test Article About AI',
|
|
'excerpt': 'This is a test excerpt about artificial intelligence.',
|
|
'content': 'This is test content. ' * 100 # Long content
|
|
}
|
|
|
|
translator = BurmeseTranslator()
|
|
result = translator.translate_article(test_article)
|
|
|
|
print("Title:", result['title_burmese'])
|
|
print("Excerpt:", result['excerpt_burmese'])
|
|
print("Content length:", len(result['content_burmese']))
|