Initial Burmddit deployment - AI news aggregator in Burmese
This commit is contained in:
255
backend/translator.py
Normal file
255
backend/translator.py
Normal file
@@ -0,0 +1,255 @@
|
||||
# Burmese translation module using Claude
|
||||
|
||||
from typing import Dict, Optional
|
||||
from loguru import logger
|
||||
import anthropic
|
||||
import re
|
||||
import config
|
||||
import time
|
||||
|
||||
class BurmeseTranslator:
    """Translate compiled English articles into casual Burmese via the Anthropic API.

    Settings come from the ``config`` module: ``ANTHROPIC_API_KEY`` plus the
    ``TRANSLATION`` mapping (``preserve_terms``, ``model``, ``max_tokens``,
    ``temperature``).
    """

    def __init__(self):
        """Create the API client and load the terms that must stay in English."""
        self.client = anthropic.Anthropic(api_key=config.ANTHROPIC_API_KEY)
        self.preserve_terms = config.TRANSLATION['preserve_terms']

    def translate_article(self, article: Dict) -> Dict:
        """Translate a compiled article to Burmese.

        Args:
            article: Mapping that must contain ``'title'``, ``'excerpt'`` and
                ``'content'`` strings.

        Returns:
            A new dict with every original key plus ``'title_burmese'``,
            ``'excerpt_burmese'`` and ``'content_burmese'``. If anything raises
            while translating, the three ``*_burmese`` keys carry the original
            English text instead.
        """
        logger.info(f"Translating article: {article['title'][:50]}...")

        try:
            # Translate title
            title_burmese = self.translate_text(
                text=article['title'],
                context="This is an article title about AI technology"
            )

            # Translate excerpt
            excerpt_burmese = self.translate_text(
                text=article['excerpt'],
                context="This is a brief article summary"
            )

            # Translate main content (in chunks if too long)
            content_burmese = self.translate_long_text(article['content'])

            # Return article with Burmese translations
            return {
                **article,
                'title_burmese': title_burmese,
                'excerpt_burmese': excerpt_burmese,
                'content_burmese': content_burmese
            }

        except Exception as e:
            # translate_text already swallows API errors, so this branch is
            # reached for other failures (e.g. a missing key or non-string field).
            logger.error(f"Translation error: {e}")
            # Fallback: return original text if translation fails
            return {
                **article,
                'title_burmese': article['title'],
                'excerpt_burmese': article['excerpt'],
                'content_burmese': article['content']
            }

    def translate_text(self, text: str, context: str = "") -> str:
        """Translate one block of English text to Burmese.

        Args:
            text: English source text. Empty or whitespace-only input is
                returned unchanged without calling the API.
            context: Short hint about what the text is, inserted into the prompt.

        Returns:
            The post-processed translation, or ``text`` itself when the API
            call (or reading its response) raises.
        """
        # Nothing to translate: skip the API call, which would otherwise be
        # asked to "translate" an empty prompt slot.
        if not text or not text.strip():
            return text

        # Build preserved terms list for this text
        preserved_terms_str = ", ".join(self.preserve_terms)

        prompt = f"""Translate the following English text to Burmese (Myanmar Unicode) in a CASUAL, EASY-TO-READ style.

🎯 CRITICAL GUIDELINES:
1. Write in **CASUAL, CONVERSATIONAL Burmese** - like talking to a friend over tea
2. Use **SIMPLE, EVERYDAY words** - avoid formal or academic language
3. Explain technical concepts in **LAYMAN TERMS** - as if explaining to your grandmother
4. Keep these terms in English: {preserved_terms_str}
5. Add **brief explanations** in parentheses for complex terms
6. Use **short sentences** - easy to read on mobile
7. Break up long paragraphs - white space is good
8. Keep markdown formatting (##, **, -, etc.) intact

TARGET AUDIENCE: General Myanmar public who are curious about AI but not tech experts

TONE: Friendly, approachable, informative but not boring

EXAMPLE STYLE:
❌ Bad (too formal): "ယခု နည်းပညာသည် ဉာဏ်ရည်တု ဖြစ်စဉ်များကို အသုံးပြုပါသည်"
✅ Good (casual): "ဒီနည်းပညာက AI (အထက်တန်းကွန်ပျူတာဦးနှောက်) ကို သုံးတာပါ"

Context: {context}

Text to translate:
{text}

Casual, easy-to-read Burmese translation:"""

        try:
            message = self.client.messages.create(
                model=config.TRANSLATION['model'],
                max_tokens=config.TRANSLATION['max_tokens'],
                temperature=config.TRANSLATION['temperature'],
                messages=[{"role": "user", "content": prompt}]
            )

            # A reply cut off by the token budget is still returned, but make
            # the truncation visible in the logs.
            if getattr(message, "stop_reason", None) == "max_tokens":
                logger.warning("Translation stopped at max_tokens; output may be truncated")

            translated = message.content[0].text.strip()

            # Post-process: normalise spacing and preserved terms
            translated = self.post_process_translation(translated)

            return translated

        except Exception as e:
            logger.error(f"API translation error: {e}")
            return text  # Fallback to original

    @staticmethod
    def _chunk_paragraphs(text: str, chunk_size: int) -> list:
        """Group blank-line-separated paragraphs into chunks under ``chunk_size`` characters."""
        chunks = []
        current_chunk = ""

        for para in text.split('\n\n'):
            if len(current_chunk) + len(para) < chunk_size:
                current_chunk += para + '\n\n'
            else:
                # Skip chunks that are only separators (e.g. runs of blank lines).
                if current_chunk.strip():
                    chunks.append(current_chunk.strip())
                # NOTE: a single paragraph longer than chunk_size is not split
                # further; it becomes one oversized chunk.
                current_chunk = para + '\n\n'

        if current_chunk.strip():
            chunks.append(current_chunk.strip())

        return chunks

    def translate_long_text(self, text: str, chunk_size: int = 2000) -> str:
        """Translate long text in paragraph-aligned chunks.

        Args:
            text: English source text; paragraphs are separated by blank lines.
            chunk_size: Character budget per chunk. Text shorter than this is
                translated in a single call.

        Returns:
            The translated chunks joined by blank lines.
        """
        # If text is short enough, translate directly
        if len(text) < chunk_size:
            return self.translate_text(text, context="This is the main article content")

        chunks = self._chunk_paragraphs(text, chunk_size)

        logger.info(f"Translating {len(chunks)} chunks...")

        # Translate each chunk
        translated_chunks = []
        for i, chunk in enumerate(chunks, 1):
            logger.debug(f"Translating chunk {i}/{len(chunks)}")
            translated = self.translate_text(
                chunk,
                context=f"This is part {i} of {len(chunks)} of a longer article"
            )
            translated_chunks.append(translated)
            # Rate limiting between requests; no need to wait after the last one
            if i < len(chunks):
                time.sleep(0.5)

        # Join chunks
        return '\n\n'.join(translated_chunks)

    def post_process_translation(self, text: str) -> str:
        """Clean up a raw translation.

        Collapses runs of three or more newlines to one blank line, inserts a
        space after the Burmese punctuation marks ``။`` and ``၊`` when they are
        directly followed by a non-space character, and restores the canonical
        casing of preserved terms.

        Args:
            text: Raw translated text.

        Returns:
            The cleaned text with surrounding whitespace stripped.
        """
        # Collapse excessive blank lines
        text = re.sub(r'(\n{3,})', '\n\n', text)

        # Ensure proper spacing after punctuation
        text = re.sub(r'([။၊])([^\s])', r'\1 \2', text)

        # Restore preserved terms whose casing was changed
        for term in self.preserve_terms:
            if term in text:
                continue
            # Match only a standalone token. ASCII lookarounds are used instead
            # of \b so that a term touching Burmese script still matches, while
            # a term such as "AI" does not rewrite the inside of "said".
            pattern = r'(?<![A-Za-z0-9])' + re.escape(term) + r'(?![A-Za-z0-9])'
            # A callable replacement inserts the term literally, even if it
            # contains backslashes.
            text = re.sub(pattern, lambda _match, _term=term: _term, text, flags=re.IGNORECASE)

        return text.strip()

    def validate_burmese_text(self, text: str) -> bool:
        """Return True if ``text`` contains at least one Myanmar-block character."""
        # Myanmar Unicode range: U+1000 to U+109F
        burmese_pattern = re.compile(r'[\u1000-\u109F]')
        return bool(burmese_pattern.search(text))
|
||||
|
||||
def run_translator(compiled_articles: list) -> list:
    """Translate a batch of compiled articles and record the stage result.

    Each article goes through ``BurmeseTranslator.translate_article``. An
    article whose translated content fails the Burmese-character check is
    still kept (only a warning is logged); an article that raises is skipped.
    The outcome is reported through ``database.log_pipeline_stage``.

    Args:
        compiled_articles: Articles to translate.

    Returns:
        The translated articles, or an empty list if the stage itself fails.
    """
    total = len(compiled_articles)
    logger.info(f"Starting translator for {total} articles...")
    started_at = time.time()

    try:
        translator = BurmeseTranslator()
        results = []

        for position, source_article in enumerate(compiled_articles, 1):
            logger.info(f"Translating article {position}/{total}")

            try:
                outcome = translator.translate_article(source_article)
                looks_burmese = translator.validate_burmese_text(outcome['content_burmese'])

                if not looks_burmese:
                    logger.warning(f"✗ Translation validation failed for article {position}")
                    # Kept anyway; the warning above is the only marker
                    results.append(outcome)
                else:
                    results.append(outcome)
                    logger.info(f"✓ Translation successful for article {position}")

                # Pause between articles (rate limiting)
                time.sleep(1)

            except Exception as e:
                logger.error(f"Error translating article {position}: {e}")
                continue

        duration = int(time.time() - started_at)

        from database import log_pipeline_stage
        log_pipeline_stage(
            stage='translate',
            status='completed',
            articles_processed=len(results),
            duration=duration
        )

        logger.info(f"Translator completed in {duration}s. Articles translated: {len(results)}")
        return results

    except Exception as e:
        logger.error(f"Translator failed: {e}")
        from database import log_pipeline_stage
        log_pipeline_stage(
            stage='translate',
            status='failed',
            error_message=str(e)
        )
        return []
|
||||
|
||||
if __name__ == '__main__':
    # Manual smoke test: translate one hard-coded article and print the
    # English and Burmese versions next to each other.
    from loguru import logger
    logger.add(config.LOG_FILE, rotation="1 day")

    sample_content = '''OpenAI has officially released GPT-5, marking a significant milestone in artificial intelligence development.

## Key Features

The new model includes:
- 10x more parameters than GPT-4
- Better reasoning capabilities
- Multimodal support for video
- Reduced hallucinations

CEO Sam Altman said, "GPT-5 represents our most advanced AI system yet."

The model will be available to ChatGPT Plus subscribers starting next month.'''

    sample_article = {
        'title': 'OpenAI Releases GPT-5: A New Era of AI',
        'excerpt': 'OpenAI today announced GPT-5, the next generation of their language model.',
        'content': sample_content,
    }

    result = BurmeseTranslator().translate_article(sample_article)

    # Only the first 200 characters of each body are shown
    english_preview = result['content'][:200]
    burmese_preview = result['content_burmese'][:200]

    print("\n=== ORIGINAL ===")
    print(f"Title: {result['title']}")
    print(f"\nContent: {english_preview}...")

    print("\n=== BURMESE ===")
    print(f"Title: {result['title_burmese']}")
    print(f"\nContent: {burmese_preview}...")
|
||||
Reference in New Issue
Block a user