Files
burmddit/backend/translator_v2.py
Zeya Phyo f51ac4afa4 Add web admin features + fix scraper & translator
Frontend changes:
- Add /admin dashboard for article management
- Add AdminButton component (Alt+Shift+A on articles)
- Add /api/admin/article API endpoints

Backend improvements:
- scraper_v2.py: Multi-layer fallback extraction (newspaper → trafilatura → readability)
- translator_v2.py: Better chunking, repetition detection, validation
- admin_tools.py: CLI admin commands
- test_scraper.py: Individual source testing

Docs:
- WEB-ADMIN-GUIDE.md: Web admin usage
- ADMIN-GUIDE.md: CLI admin usage
- SCRAPER-IMPROVEMENT-PLAN.md: Scraper fixes details
- TRANSLATION-FIX.md: Translation improvements
- ADMIN-FEATURES-SUMMARY.md: Implementation summary

Fixes:
- Article scraping now works (went from 0 to 96+ articles scraped)
- Translation quality issues (repetition, truncation)
- Added 13 new RSS sources
2026-02-26 09:17:50 +00:00

353 lines
14 KiB
Python

# Improved Burmese translation module with better error handling
from typing import Dict, Optional
from loguru import logger
import anthropic
import re
import config
import time
class BurmeseTranslator:
    """Translate compiled English articles into Burmese via the Anthropic API.

    Long article bodies are split into chunks, each chunk is translated
    separately, and the result is checked with simple heuristics (presence of
    Myanmar-block characters, length ratio, repeated word sequences).
    """

    def __init__(self):
        """Create the API client and load the list of terms to keep in English."""
        self.client = anthropic.Anthropic(api_key=config.ANTHROPIC_API_KEY)
        self.preserve_terms = config.TRANSLATION['preserve_terms']

    def translate_article(self, article: Dict) -> Dict:
        """Translate an article's title, excerpt and content to Burmese.

        Args:
            article: Mapping with at least 'title', 'excerpt' and 'content'.

        Returns:
            A copy of ``article`` with 'title_burmese', 'excerpt_burmese' and
            'content_burmese' added. If anything raises during translation,
            those three keys carry the original English text instead.
        """
        logger.info(f"Translating article: {article['title'][:50]}...")
        try:
            # Translate title
            title_burmese = self.translate_text(
                text=article['title'],
                context="This is an article title about AI technology",
                max_length=200
            )
            # Translate excerpt
            excerpt_burmese = self.translate_text(
                text=article['excerpt'],
                context="This is a brief article summary",
                max_length=300
            )
            # Translate main content with improved chunking
            content_burmese = self.translate_long_text(
                article['content'],
                chunk_size=1200  # Reduced from 2000 for safety
            )
            # Validate translation quality
            if not self.validate_translation(content_burmese, article['content']):
                logger.warning("Translation validation failed, using fallback")
                # Try again with smaller chunks
                content_burmese = self.translate_long_text(
                    article['content'],
                    chunk_size=800  # Even smaller
                )
                # Re-check the retry so a second failure is visible in the
                # logs; the result is still returned as a best effort.
                if not self.validate_translation(content_burmese, article['content']):
                    logger.warning("Translation still failed validation after retry with smaller chunks")
            # Return article with Burmese translations
            return {
                **article,
                'title_burmese': title_burmese,
                'excerpt_burmese': excerpt_burmese,
                'content_burmese': content_burmese
            }
        except Exception as e:
            logger.error(f"Translation error: {e}")
            # Fallback: return original text if translation fails
            return {
                **article,
                'title_burmese': article['title'],
                'excerpt_burmese': article['excerpt'],
                'content_burmese': article['content']
            }

    def _request_translation(self, prompt: str, temperature: float) -> str:
        """Send one prompt to the model and return the post-processed reply text."""
        message = self.client.messages.create(
            model=config.TRANSLATION['model'],
            max_tokens=min(config.TRANSLATION['max_tokens'], 3000),  # Cap at 3000
            temperature=temperature,
            messages=[{"role": "user", "content": prompt}]
        )
        return self.post_process_translation(message.content[0].text.strip())

    def translate_text(self, text: str, context: str = "", max_length: Optional[int] = None) -> str:
        """Translate a text block to Burmese.

        Args:
            text: English text to translate.
            context: Short hint about what the text is, embedded in the prompt.
            max_length: If truthy, the prompt asks the model to stay under this
                many words.
                NOTE(review): callers pass 200/300, which reads like a
                character budget - confirm the intended unit.

        Returns:
            The post-processed translation. Blank input is returned unchanged
            without calling the API, and the original ``text`` is returned if
            the API call raises.
        """
        # Nothing to translate: skip the API round-trip entirely.
        if not text or not text.strip():
            return text
        # Build preserved terms list
        preserved_terms_str = ", ".join(self.preserve_terms)
        # Add length guidance if specified
        length_guidance = ""
        if max_length:
            length_guidance = f"\n⚠️ IMPORTANT: Keep translation under {max_length} words. Be concise."
        prompt = f"""Translate the following English text to Burmese (Myanmar Unicode) in a CASUAL, EASY-TO-READ style.
🎯 CRITICAL GUIDELINES:
1. Write in **CASUAL, CONVERSATIONAL Burmese** - like talking to a friend
2. Use **SIMPLE, EVERYDAY words** - avoid formal or academic language
3. Explain technical concepts in **LAYMAN TERMS**
4. Keep these terms in English: {preserved_terms_str}
5. Add **brief explanations** in parentheses for complex terms
6. Use **short sentences** - easy to read on mobile
7. Break up long paragraphs - white space is good
8. Keep markdown formatting (##, **, -, etc.) intact{length_guidance}
🚫 CRITICAL: DO NOT REPEAT TEXT OR GET STUCK IN LOOPS!
- If you start repeating, STOP immediately
- Translate fully but concisely
- Each sentence should be unique
TARGET AUDIENCE: General Myanmar public curious about AI
Context: {context}
Text to translate:
{text}
Burmese translation (natural, concise, no repetitions):"""
        try:
            translated = self._request_translation(prompt, config.TRANSLATION['temperature'])
            # Check for hallucination/loops
            if self.detect_repetition(translated):
                logger.warning("Detected repetitive text, retrying with lower temperature")
                # Retry with lower temperature
                translated = self._request_translation(prompt, 0.3)
            return translated
        except Exception as e:
            logger.error(f"API translation error: {e}")
            return text  # Fallback to original

    @staticmethod
    def _split_into_chunks(text: str, chunk_size: int) -> list:
        """Split ``text`` into stripped, non-empty chunks of roughly ``chunk_size`` chars.

        Paragraphs (separated by a blank line) are packed together while they
        fit. A paragraph longer than ``chunk_size`` is split on '. ' sentence
        boundaries. A single sentence longer than ``chunk_size`` is kept whole,
        so a chunk can still exceed the limit.
        """
        chunks = []

        def flush(buffer: str) -> None:
            cleaned = buffer.strip()
            if cleaned:
                chunks.append(cleaned)

        current = ""
        for para in text.split('\n\n'):
            # +4 leaves headroom for the paragraph separator
            if len(current) + len(para) + 4 < chunk_size:
                current = f"{current}\n\n{para}" if current else para
                continue
            # Current chunk is full, save it and start a new one
            flush(current)
            if len(para) <= chunk_size:
                current = para
                continue
            # Paragraph itself is too long: split it by sentences.
            pieces = para.split('. ')
            last = len(pieces) - 1
            current = ""
            for idx, piece in enumerate(pieces):
                # str.split removed the '. ' separator from every piece except
                # the last; restore it only where it was actually present so
                # the final sentence does not gain a spurious extra period.
                sentence = piece if idx == last else piece + '. '
                if len(current) + len(sentence) < chunk_size:
                    current += sentence
                else:
                    flush(current)
                    current = sentence
        # Don't forget the last chunk
        flush(current)
        return chunks

    def translate_long_text(self, text: str, chunk_size: int = 1200) -> str:
        """Translate long text chunk by chunk.

        Args:
            text: Full English text.
            chunk_size: Approximate maximum characters per chunk.

        Returns:
            The translated chunks joined by blank lines. A chunk that could not
            be translated is left in English. Text with no non-blank content is
            returned unchanged.
        """
        # If text is short enough, translate directly
        if len(text) < chunk_size:
            return self.translate_text(text, context="This is the main article content")
        logger.info(f"Article is {len(text)} chars, splitting into chunks...")
        chunks = self._split_into_chunks(text, chunk_size)
        if not chunks:
            # Only blank paragraphs: nothing to translate, and the average
            # below would otherwise divide by zero.
            logger.warning("No translatable content found after chunking")
            return text
        total = len(chunks)
        logger.info(f"Split into {total} chunks (avg {len(text)//total} chars each)")
        # Translate each chunk with progress tracking
        translated_chunks = []
        failed_chunks = 0
        for index, chunk in enumerate(chunks, start=1):
            logger.info(f"Translating chunk {index}/{total} ({len(chunk)} chars)...")
            try:
                translated = self.translate_text(
                    chunk,
                    context=f"This is part {index} of {total} of a longer article"
                )
                # Validate chunk translation
                if self.detect_repetition(translated):
                    logger.warning(f"Chunk {index} has repetition, retrying...")
                    time.sleep(1)
                    translated = self.translate_text(
                        chunk,
                        context=f"This is part {index} of {total} - translate fully without repetition"
                    )
                # translate_text swallows API errors and hands back its input,
                # so an unchanged chunk is counted as a failed translation.
                if translated == chunk:
                    failed_chunks += 1
                translated_chunks.append(translated)
                time.sleep(0.5)  # Rate limiting
            except Exception as e:
                logger.error(f"Failed to translate chunk {index}: {e}")
                failed_chunks += 1
                # Use original text as fallback for this chunk
                translated_chunks.append(chunk)
                time.sleep(1)
        if failed_chunks > 0:
            logger.warning(f"{failed_chunks}/{total} chunks failed translation")
        # Join chunks
        result = '\n\n'.join(translated_chunks)
        logger.info(f"Translation complete: {len(result)} chars (original: {len(text)} chars)")
        return result

    def detect_repetition(self, text: str, threshold: int = 5) -> bool:
        """Detect repetitive patterns (a symptom of model looping).

        Args:
            text: Text to inspect.
            threshold: Number of occurrences of the same 5-token sequence at
                which the text is considered repetitive.

        Returns:
            True if some sequence of five whitespace-separated tokens occurs at
            least ``threshold`` times; False otherwise, including for text
            shorter than 100 characters or 10 tokens.
        """
        if len(text) < 100:
            return False
        # NOTE(review): tokens are whitespace-separated; Burmese is often
        # written without spaces between words, so a "token" may be a whole
        # phrase - confirm this heuristic is sensitive enough in practice.
        words = text.split()
        if len(words) < 10:
            return False
        # Count every 5-token window
        sequences = {}
        for i in range(len(words) - 4):
            seq = ' '.join(words[i:i+5])
            sequences[seq] = sequences.get(seq, 0) + 1
        # A window seen `threshold` or more times is treated as repetition
        max_repetitions = max(sequences.values()) if sequences else 0
        if max_repetitions >= threshold:
            logger.warning(f"Detected repetition: {max_repetitions} occurrences")
            return True
        return False

    def validate_translation(self, translated: str, original: str) -> bool:
        """Run heuristic quality checks on a translation.

        Args:
            translated: Candidate Burmese text.
            original: The English source text.

        Returns:
            True only if the translation has at least 50 characters, contains
            Myanmar-block characters, is between 0.3x and 3.0x the length of a
            non-empty original, and shows no repetition.
        """
        # Check 1: Not empty
        if not translated or len(translated) < 50:
            logger.warning("Translation too short")
            return False
        # Check 2: Has Burmese Unicode
        if not self.validate_burmese_text(translated):
            logger.warning("Translation missing Burmese text")
            return False
        # Check 3: Reasonable length ratio (translated should be 30-300% of original)
        if not original:
            # Ratio is undefined; output for an empty source is suspicious.
            logger.warning("Original text is empty, cannot validate length ratio")
            return False
        ratio = len(translated) / len(original)
        if ratio < 0.3 or ratio > 3.0:
            logger.warning(f"Translation length ratio suspicious: {ratio:.2f}")
            return False
        # Check 4: No repetition
        if self.detect_repetition(translated):
            logger.warning("Translation has repetitive patterns")
            return False
        return True

    def post_process_translation(self, text: str) -> str:
        """Normalise whitespace in a translation.

        Collapses runs of three or more newlines to a blank line, strips each
        line, and inserts a space after the Burmese punctuation marks '။' and
        '၊' when they are directly followed by a non-space character.
        """
        # Remove excessive newlines
        text = re.sub(r'(\n{3,})', '\n\n', text)
        # Remove leading/trailing whitespace from each line
        lines = [line.strip() for line in text.split('\n')]
        text = '\n'.join(lines)
        # Ensure proper spacing after Burmese punctuation
        text = re.sub(r'([။၊])([^\s])', r'\1 \2', text)
        return text.strip()

    def validate_burmese_text(self, text: str) -> bool:
        """Return True if ``text`` contains at least one character in U+1000-U+109F."""
        # Myanmar Unicode range: U+1000 to U+109F
        burmese_pattern = re.compile(r'[\u1000-\u109F]')
        return bool(burmese_pattern.search(text))
def run_translator(compiled_articles: list) -> list:
    """Translate compiled articles to Burmese.

    Args:
        compiled_articles: Article dicts with 'title', 'excerpt' and 'content'.

    Returns:
        One entry per input article, each carrying 'title_burmese',
        'excerpt_burmese' and 'content_burmese'. When translation is not
        possible (for one article or for the whole run) those keys hold the
        original English text, so the result has the same shape on every path.
    """
    def untranslated(article):
        # Fallback record: reuse the English text for the Burmese fields.
        # Non-dict entries are passed through untouched rather than raising.
        if not isinstance(article, dict):
            return article
        return {
            **article,
            'title_burmese': article.get('title', ''),
            'excerpt_burmese': article.get('excerpt', ''),
            'content_burmese': article.get('content', '')
        }

    logger.info(f"Starting translator for {len(compiled_articles)} articles...")
    start_time = time.time()
    try:
        translator = BurmeseTranslator()
    except Exception as e:
        logger.error(f"Translator failed: {e}")
        # Return originals as fallback, in the same shape as translated output
        return [untranslated(article) for article in compiled_articles]
    translated_articles = []
    for i, article in enumerate(compiled_articles, 1):
        logger.info(f"Translating article {i}/{len(compiled_articles)}")
        try:
            translated_article = translator.translate_article(article)
            translated_articles.append(translated_article)
            logger.info(f"✓ Translation successful for article {i}")
        except Exception as e:
            logger.error(f"Failed to translate article {i}: {e}")
            # Add article with original English text as fallback
            translated_articles.append(untranslated(article))
    duration = int(time.time() - start_time)
    logger.info(f"Translator completed in {duration}s. Articles translated: {len(translated_articles)}")
    return translated_articles
if __name__ == '__main__':
    # Manual smoke test: push one synthetic article through the translator
    # and print the translated title, excerpt and the content length.
    sample_article = dict(
        title='Test Article About AI',
        excerpt='This is a test excerpt about artificial intelligence.',
        content='This is test content. ' * 100,  # Long enough to force chunking
    )
    outcome = BurmeseTranslator().translate_article(sample_article)
    report = (
        ("Title:", outcome['title_burmese']),
        ("Excerpt:", outcome['excerpt_burmese']),
        ("Content length:", len(outcome['content_burmese'])),
    )
    for label, value in report:
        print(label, value)