Files
burmddit/backend/translator.py

256 lines
9.5 KiB
Python

# Burmese translation module using Claude
from typing import Dict, Optional
from loguru import logger
import anthropic
import re
import config
import time
class BurmeseTranslator:
    """Translate English article text into casual, conversational Burmese.

    Translation is delegated to the Anthropic Messages API. The model name,
    token limit, temperature and the list of terms that must stay in English
    are all read from ``config.TRANSLATION``.
    """

    # Three or more consecutive newlines; collapsed to a single blank line.
    _EXCESS_NEWLINES = re.compile(r'\n{3,}')

    # Burmese sentence/clause punctuation immediately followed by text.
    # The lookahead deliberately skips closing brackets/quotes, further
    # punctuation and markdown emphasis markers, so that "။)" or "**...။**"
    # are not broken by an inserted space. Using a lookahead (instead of
    # consuming the next character) also handles back-to-back occurrences.
    _PUNCT_MISSING_SPACE = re.compile(r'([။၊])(?=[^\s။၊)\]}"\'”’*_])')

    # Any code point in the Myanmar Unicode block (U+1000 to U+109F).
    _BURMESE_CHAR = re.compile(r'[\u1000-\u109F]')

    def __init__(self):
        """Create the API client and load the terms to keep in English."""
        self.client = anthropic.Anthropic(api_key=config.ANTHROPIC_API_KEY)
        self.preserve_terms = config.TRANSLATION['preserve_terms']

    def translate_article(self, article: Dict) -> Dict:
        """Translate a compiled article's title, excerpt and content.

        Args:
            article: Mapping with at least ``'title'``, ``'excerpt'`` and
                ``'content'`` keys.

        Returns:
            A copy of ``article`` extended with ``'title_burmese'``,
            ``'excerpt_burmese'`` and ``'content_burmese'``. If translation
            raises, those three keys carry the original text instead.

        Raises:
            KeyError: If one of the three required keys is missing.
        """
        # Read the required fields up front so a missing key fails before any
        # API call is made, and so the fallback below can never raise a
        # second KeyError from inside the exception handler.
        title = article['title']
        excerpt = article['excerpt']
        content = article['content']

        logger.info(f"Translating article: {title[:50]}...")
        try:
            title_burmese = self.translate_text(
                text=title,
                context="This is an article title about AI technology"
            )
            excerpt_burmese = self.translate_text(
                text=excerpt,
                context="This is a brief article summary"
            )
            # Main content may be long; it is translated chunk by chunk.
            content_burmese = self.translate_long_text(content)
            return {
                **article,
                'title_burmese': title_burmese,
                'excerpt_burmese': excerpt_burmese,
                'content_burmese': content_burmese
            }
        except Exception as e:
            logger.error(f"Translation error: {e}")
            # Fallback: return original text if translation fails
            return {
                **article,
                'title_burmese': title,
                'excerpt_burmese': excerpt,
                'content_burmese': content
            }

    def translate_text(self, text: str, context: str = "") -> str:
        """Translate one block of text to Burmese.

        Args:
            text: English text to translate.
            context: Short hint inserted into the prompt describing what the
                text is (title, summary, part of an article, ...).

        Returns:
            The post-processed translation. Empty or whitespace-only input is
            returned unchanged without calling the API, and the original text
            is returned if the API call fails for any reason.
        """
        # Nothing to translate: avoid spending an API call on blank input.
        if not text or not text.strip():
            return text

        preserved_terms_str = ", ".join(self.preserve_terms)
        prompt = f"""Translate the following English text to Burmese (Myanmar Unicode) in a CASUAL, EASY-TO-READ style.
🎯 CRITICAL GUIDELINES:
1. Write in **CASUAL, CONVERSATIONAL Burmese** - like talking to a friend over tea
2. Use **SIMPLE, EVERYDAY words** - avoid formal or academic language
3. Explain technical concepts in **LAYMAN TERMS** - as if explaining to your grandmother
4. Keep these terms in English: {preserved_terms_str}
5. Add **brief explanations** in parentheses for complex terms
6. Use **short sentences** - easy to read on mobile
7. Break up long paragraphs - white space is good
8. Keep markdown formatting (##, **, -, etc.) intact
TARGET AUDIENCE: General Myanmar public who are curious about AI but not tech experts
TONE: Friendly, approachable, informative but not boring
EXAMPLE STYLE:
❌ Bad (too formal): "ယခု နည်းပညာသည် ဉာဏ်ရည်တု ဖြစ်စဉ်များကို အသုံးပြုပါသည်"
✅ Good (casual): "ဒီနည်းပညာက AI (အထက်တန်းကွန်ပျူတာဦးနှောက်) ကို သုံးတာပါ"
Context: {context}
Text to translate:
{text}
Casual, easy-to-read Burmese translation:"""
        try:
            message = self.client.messages.create(
                model=config.TRANSLATION['model'],
                max_tokens=config.TRANSLATION['max_tokens'],
                temperature=config.TRANSLATION['temperature'],
                messages=[{"role": "user", "content": prompt}]
            )
            translated = message.content[0].text.strip()
            return self.post_process_translation(translated)
        except Exception as e:
            # Deliberate best-effort: a failed call degrades to the source
            # text rather than aborting the whole article.
            logger.error(f"API translation error: {e}")
            return text  # Fallback to original

    def translate_long_text(self, text: str, chunk_size: int = 2000) -> str:
        """Translate text of any length, splitting it into chunks if needed.

        Text shorter than ``chunk_size`` characters is translated in a single
        call. Longer text is split on blank lines and the paragraphs are
        grouped into chunks of roughly ``chunk_size`` characters; a single
        paragraph longer than ``chunk_size`` becomes its own chunk.

        Args:
            text: English text to translate.
            chunk_size: Approximate maximum chunk length in characters.

        Returns:
            The translated chunks joined by blank lines.
        """
        if len(text) < chunk_size:
            return self.translate_text(text, context="This is the main article content")

        # Group paragraphs into chunks, tracking the running length so the
        # chunk text is assembled once with join() rather than by repeated
        # string concatenation.
        chunks = []
        pending = []
        pending_len = 0
        for para in text.split('\n\n'):
            if not para.strip():
                continue  # blank paragraphs carry nothing to translate
            if pending and pending_len + len(para) >= chunk_size:
                chunks.append('\n\n'.join(pending).strip())
                pending, pending_len = [], 0
            pending.append(para)
            pending_len += len(para) + 2  # +2 for the '\n\n' separator
        if pending:
            chunks.append('\n\n'.join(pending).strip())

        total = len(chunks)
        logger.info(f"Translating {total} chunks...")

        translated_chunks = []
        for i, chunk in enumerate(chunks, 1):
            logger.debug(f"Translating chunk {i}/{total}")
            translated_chunks.append(
                self.translate_text(
                    chunk,
                    context=f"This is part {i} of {total} of a longer article"
                )
            )
            # Rate limiting between calls; no need to wait after the last one.
            if i < total:
                time.sleep(0.5)

        return '\n\n'.join(translated_chunks)

    def post_process_translation(self, text: str) -> str:
        """Normalise whitespace, punctuation spacing and preserved terms.

        Args:
            text: Raw translation returned by the model.

        Returns:
            The cleaned text with surrounding whitespace stripped.
        """
        # Collapse runs of blank lines down to a single blank line.
        text = self._EXCESS_NEWLINES.sub('\n\n', text)

        # Ensure a space follows Burmese punctuation when text comes next.
        text = self._PUNCT_MISSING_SPACE.sub(r'\1 ', text)

        # Restore the canonical casing of preserved terms. Matching is
        # restricted to whole ASCII words so that e.g. the term "AI" does not
        # rewrite the "ai" inside "said". ASCII lookarounds are used instead
        # of \b because Burmese letters count as word characters and are
        # often written directly next to an English term.
        for term in self.preserve_terms:
            if not term:
                continue
            pattern = re.compile(
                r'(?<![A-Za-z0-9_])' + re.escape(term) + r'(?![A-Za-z0-9_])',
                flags=re.IGNORECASE,
            )
            # A callable replacement keeps backslashes in the term literal.
            text = pattern.sub(lambda _match, _term=term: _term, text)

        return text.strip()

    def validate_burmese_text(self, text: str) -> bool:
        """Return True if ``text`` contains at least one Myanmar-block character.

        Empty or ``None`` input is reported as invalid rather than raising.
        """
        if not text:
            return False
        return bool(self._BURMESE_CHAR.search(text))
def _log_translate_stage(**fields) -> None:
    """Record the outcome of the translate stage, never raising on failure."""
    try:
        from database import log_pipeline_stage
        log_pipeline_stage(stage='translate', **fields)
    except Exception as e:
        # Bookkeeping must not discard translations that already succeeded.
        logger.error(f"Could not record translate stage: {e}")


def run_translator(compiled_articles: list) -> list:
    """Translate a batch of compiled articles to Burmese.

    Each article is translated independently: an article that raises is
    logged and skipped, and an article whose translated content contains no
    Burmese characters is still returned but logged with a warning. The
    outcome of the stage is recorded through ``database.log_pipeline_stage``;
    a failure of that bookkeeping is logged and does not affect the result.

    Args:
        compiled_articles: Article dicts as accepted by
            ``BurmeseTranslator.translate_article``.

    Returns:
        The translated articles, or an empty list if the translator could
        not be created.
    """
    total = len(compiled_articles)
    logger.info(f"Starting translator for {total} articles...")
    start_time = time.time()

    try:
        translator = BurmeseTranslator()
    except Exception as e:
        logger.error(f"Translator failed: {e}")
        _log_translate_stage(status='failed', error_message=str(e))
        return []

    translated_articles = []
    for i, article in enumerate(compiled_articles, 1):
        logger.info(f"Translating article {i}/{total}")
        try:
            translated = translator.translate_article(article)
            if translator.validate_burmese_text(translated['content_burmese']):
                logger.info(f"✓ Translation successful for article {i}")
            else:
                logger.warning(f"✗ Translation validation failed for article {i}")
            # Articles that fail validation are kept; the warning flags them.
            translated_articles.append(translated)
            time.sleep(1)  # Rate limiting
        except Exception as e:
            logger.error(f"Error translating article {i}: {e}")
            continue

    duration = int(time.time() - start_time)
    _log_translate_stage(
        status='completed',
        articles_processed=len(translated_articles),
        duration=duration
    )
    logger.info(f"Translator completed in {duration}s. Articles translated: {len(translated_articles)}")
    return translated_articles
if __name__ == '__main__':
    from loguru import logger

    # Also write log output to the configured file, rotating it daily.
    logger.add(config.LOG_FILE, rotation="1 day")

    # Manual smoke test: push one hard-coded sample article through the
    # translator and print the original next to the Burmese result.
    sample_content = '''OpenAI has officially released GPT-5, marking a significant milestone in artificial intelligence development.
## Key Features
The new model includes:
- 10x more parameters than GPT-4
- Better reasoning capabilities
- Multimodal support for video
- Reduced hallucinations
CEO Sam Altman said, "GPT-5 represents our most advanced AI system yet."
The model will be available to ChatGPT Plus subscribers starting next month.'''
    sample_article = {
        'title': 'OpenAI Releases GPT-5: A New Era of AI',
        'excerpt': 'OpenAI today announced GPT-5, the next generation of their language model.',
        'content': sample_content,
    }

    result = BurmeseTranslator().translate_article(sample_article)

    # Each section prints a heading, the title, and the first 200 characters
    # of the content.
    sections = (
        ("\n=== ORIGINAL ===", 'title', 'content'),
        ("\n=== BURMESE ===", 'title_burmese', 'content_burmese'),
    )
    for heading, title_key, content_key in sections:
        print(heading)
        print(f"Title: {result[title_key]}")
        print(f"\nContent: {result[content_key][:200]}...")