# Burmese translation module using Claude
|
|
|
|
from typing import Dict, Optional
|
|
from loguru import logger
|
|
import anthropic
|
|
import re
|
|
import config
|
|
import time
|
|
|
|
class BurmeseTranslator:
    """Translate English article text into casual Burmese via the Anthropic API.

    The client is built from ``config.ANTHROPIC_API_KEY``; model, token limit,
    temperature and the list of terms to keep in English are read from
    ``config.TRANSLATION``.
    """

    # Three or more consecutive newlines (i.e. more than one blank line).
    _EXTRA_BLANK_LINES = re.compile(r'\n{3,}')
    # Burmese sentence/phrase punctuation immediately followed by non-whitespace.
    _PUNCT_WITHOUT_SPACE = re.compile(r'([။၊])([^\s])')
    # Myanmar Unicode block: U+1000 to U+109F.
    _BURMESE_CHAR = re.compile(r'[\u1000-\u109F]')

    def __init__(self):
        self.client = anthropic.Anthropic(api_key=config.ANTHROPIC_API_KEY)
        self.preserve_terms = config.TRANSLATION['preserve_terms']

    def translate_article(self, article: Dict) -> Dict:
        """Translate the title, excerpt and content of a compiled article.

        Args:
            article: Mapping that must contain 'title', 'excerpt' and 'content'.

        Returns:
            A copy of ``article`` with 'title_burmese', 'excerpt_burmese' and
            'content_burmese' added. If translation raises, those three keys
            carry the original English text instead.
        """
        logger.info(f"Translating article: {article['title'][:50]}...")

        try:
            # Translate title
            title_burmese = self.translate_text(
                text=article['title'],
                context="This is an article title about AI technology"
            )

            # Translate excerpt
            excerpt_burmese = self.translate_text(
                text=article['excerpt'],
                context="This is a brief article summary"
            )

            # Translate main content (in chunks if too long)
            content_burmese = self.translate_long_text(article['content'])

            # Return article with Burmese translations
            return {
                **article,
                'title_burmese': title_burmese,
                'excerpt_burmese': excerpt_burmese,
                'content_burmese': content_burmese
            }

        except Exception as e:
            logger.error(f"Translation error: {e}")
            # Fallback: return original text if translation fails
            return {
                **article,
                'title_burmese': article['title'],
                'excerpt_burmese': article['excerpt'],
                'content_burmese': article['content']
            }

    def translate_text(self, text: str, context: str = "") -> str:
        """Translate one block of text to Burmese.

        Args:
            text: English text to translate.
            context: Short description of what the text is, included in the prompt.

        Returns:
            The post-processed translation. Empty or whitespace-only input is
            returned unchanged without calling the API, and the original text
            is returned if the API call fails.
        """
        # Nothing to translate: skip the API round trip entirely.
        if not text or not text.strip():
            return text

        # Build preserved terms list for this text
        preserved_terms_str = ", ".join(self.preserve_terms)

        prompt = f"""Translate the following English text to Burmese (Myanmar Unicode) in a CASUAL, EASY-TO-READ style.

🎯 CRITICAL GUIDELINES:
1. Write in **CASUAL, CONVERSATIONAL Burmese** - like talking to a friend over tea
2. Use **SIMPLE, EVERYDAY words** - avoid formal or academic language
3. Explain technical concepts in **LAYMAN TERMS** - as if explaining to your grandmother
4. Keep these terms in English: {preserved_terms_str}
5. Add **brief explanations** in parentheses for complex terms
6. Use **short sentences** - easy to read on mobile
7. Break up long paragraphs - white space is good
8. Keep markdown formatting (##, **, -, etc.) intact

TARGET AUDIENCE: General Myanmar public who are curious about AI but not tech experts

TONE: Friendly, approachable, informative but not boring

EXAMPLE STYLE:
❌ Bad (too formal): "ယခု နည်းပညာသည် ဉာဏ်ရည်တု ဖြစ်စဉ်များကို အသုံးပြုပါသည်"
✅ Good (casual): "ဒီနည်းပညာက AI (အထက်တန်းကွန်ပျူတာဦးနှောက်) ကို သုံးတာပါ"

Context: {context}

Text to translate:
{text}

Casual, easy-to-read Burmese translation:"""

        try:
            message = self.client.messages.create(
                model=config.TRANSLATION['model'],
                max_tokens=config.TRANSLATION['max_tokens'],
                temperature=config.TRANSLATION['temperature'],
                messages=[{"role": "user", "content": prompt}]
            )

            translated = message.content[0].text.strip()

            # Post-process: normalise spacing and preserved terms
            return self.post_process_translation(translated)

        except Exception as e:
            logger.error(f"API translation error: {e}")
            return text  # Fallback to original

    def _split_into_chunks(self, text: str, chunk_size: int) -> list:
        """Group blank-line-separated paragraphs into chunks of about ``chunk_size`` chars.

        A paragraph is never split: one that alone exceeds ``chunk_size``
        becomes its own chunk. Whitespace-only chunks are dropped.
        """
        chunks = []
        parts = []
        # Running length counts each paragraph plus its 2-char separator,
        # matching how the chunk would look when joined with trailing '\n\n'.
        size = 0

        for para in text.split('\n\n'):
            if parts and size + len(para) >= chunk_size:
                chunk = '\n\n'.join(parts).strip()
                if chunk:
                    chunks.append(chunk)
                parts, size = [], 0
            parts.append(para)
            size += len(para) + 2

        if parts:
            chunk = '\n\n'.join(parts).strip()
            if chunk:
                chunks.append(chunk)

        return chunks

    def translate_long_text(self, text: str, chunk_size: int = 2000) -> str:
        """Translate long text paragraph-group by paragraph-group.

        Args:
            text: Text to translate; paragraphs are separated by blank lines.
            chunk_size: Approximate maximum number of characters per request.

        Returns:
            The translated chunks joined with blank lines. Text shorter than
            ``chunk_size`` is translated in a single request.
        """
        # If text is short enough, translate directly
        if len(text) < chunk_size:
            return self.translate_text(text, context="This is the main article content")

        chunks = self._split_into_chunks(text, chunk_size)
        logger.info(f"Translating {len(chunks)} chunks...")

        # Translate each chunk
        translated_chunks = []
        for i, chunk in enumerate(chunks):
            logger.debug(f"Translating chunk {i+1}/{len(chunks)}")
            translated = self.translate_text(
                chunk,
                context=f"This is part {i+1} of {len(chunks)} of a longer article"
            )
            translated_chunks.append(translated)
            time.sleep(0.5)  # Rate limiting

        # Join chunks
        return '\n\n'.join(translated_chunks)

    def post_process_translation(self, text: str) -> str:
        """Normalise whitespace, punctuation spacing and preserved-term casing.

        Args:
            text: Raw translation returned by the model.

        Returns:
            The cleaned text, stripped of leading/trailing whitespace.
        """
        # Collapse runs of blank lines down to a single blank line
        text = self._EXTRA_BLANK_LINES.sub('\n\n', text)

        # Ensure a space follows Burmese punctuation
        text = self._PUNCT_WITHOUT_SPACE.sub(r'\1 \2', text)

        # Restore the canonical spelling of preserved terms whose casing was
        # changed. Only attempted when the exact spelling is absent from the text.
        for term in self.preserve_terms:
            if not term or term in text:
                continue
            # Match the term only when it is not embedded in a longer ASCII
            # word, so "AI" no longer rewrites the "ai" inside "said".
            # ASCII lookarounds are used instead of \b because Burmese letters
            # count as word characters and would hide the boundary.
            standalone = re.compile(
                r'(?<![A-Za-z0-9])' + re.escape(term) + r'(?![A-Za-z0-9])',
                flags=re.IGNORECASE,
            )
            # A function replacement inserts the term literally; a plain string
            # would be parsed as a template and misread any backslashes.
            text = standalone.sub(lambda _match, canonical=term: canonical, text)

        return text.strip()

    def validate_burmese_text(self, text: str) -> bool:
        """Return True if ``text`` contains at least one Myanmar-block character."""
        return bool(self._BURMESE_CHAR.search(text))
|
|
|
|
def run_translator(compiled_articles: list) -> list:
    """Translate compiled articles to Burmese and record the pipeline stage.

    Args:
        compiled_articles: Article dicts to pass to
            ``BurmeseTranslator.translate_article``.

    Returns:
        The translated articles. Articles whose translation raised are
        skipped; articles that fail validation are still included. An empty
        list is returned only when translation itself could not run.
    """
    logger.info(f"Starting translator for {len(compiled_articles)} articles...")
    start_time = time.time()
    translated_articles = []

    try:
        translator = BurmeseTranslator()

        for i, article in enumerate(compiled_articles, 1):
            logger.info(f"Translating article {i}/{len(compiled_articles)}")

            try:
                translated = translator.translate_article(article)

                # Validation only affects the log line: the article is kept
                # either way, and no flag is stored on it.
                if translator.validate_burmese_text(translated['content_burmese']):
                    logger.info(f"✓ Translation successful for article {i}")
                else:
                    logger.warning(f"✗ Translation validation failed for article {i}")
                translated_articles.append(translated)

                time.sleep(1)  # Rate limiting

            except Exception as e:
                logger.error(f"Error translating article {i}: {e}")
                continue

    except Exception as e:
        logger.error(f"Translator failed: {e}")
        _log_translate_stage(status='failed', error_message=str(e))
        return []

    duration = int(time.time() - start_time)

    # Stage logging happens outside the translation try-block so that a
    # database problem cannot discard articles that were already translated.
    _log_translate_stage(
        status='completed',
        articles_processed=len(translated_articles),
        duration=duration
    )

    logger.info(f"Translator completed in {duration}s. Articles translated: {len(translated_articles)}")
    return translated_articles


def _log_translate_stage(**fields) -> None:
    """Record the 'translate' stage outcome; a logging failure is reported, not raised."""
    try:
        # Imported at call time, as the original code did.
        # NOTE(review): presumably to avoid an import cycle - confirm.
        from database import log_pipeline_stage
        log_pipeline_stage(stage='translate', **fields)
    except Exception as e:
        logger.error(f"Could not record translate stage: {e}")
|
|
|
|
if __name__ == '__main__':
    # Manual smoke test: translate one sample article and print both versions.
    from loguru import logger
    logger.add(config.LOG_FILE, rotation="1 day")

    sample_content = '''OpenAI has officially released GPT-5, marking a significant milestone in artificial intelligence development.

## Key Features

The new model includes:
- 10x more parameters than GPT-4
- Better reasoning capabilities
- Multimodal support for video
- Reduced hallucinations

CEO Sam Altman said, "GPT-5 represents our most advanced AI system yet."

The model will be available to ChatGPT Plus subscribers starting next month.'''

    sample_article = {
        'title': 'OpenAI Releases GPT-5: A New Era of AI',
        'excerpt': 'OpenAI today announced GPT-5, the next generation of their language model.',
        'content': sample_content,
    }

    result = BurmeseTranslator().translate_article(sample_article)

    # (heading, title key, content key) for each version that gets printed.
    sections = (
        ("\n=== ORIGINAL ===", 'title', 'content'),
        ("\n=== BURMESE ===", 'title_burmese', 'content_burmese'),
    )
    for heading, title_key, content_key in sections:
        print(heading)
        print(f"Title: {result[title_key]}")
        # Only the first 200 characters of the body are shown.
        print(f"\nContent: {result[content_key][:200]}...")
|