-- burmddit/database/schema.sql

-- Burmddit Database Schema
-- PostgreSQL

-- Categories: the sections articles are filed under.
-- Both the English name and the slug must be unique; the Burmese label is
-- mandatory but not required to be unique.
CREATE TABLE IF NOT EXISTS categories (
    id            SERIAL,
    name          VARCHAR(100) NOT NULL,
    name_burmese  VARCHAR(100) NOT NULL,
    slug          VARCHAR(100) NOT NULL,
    description   TEXT,                                  -- optional free text
    created_at    TIMESTAMP DEFAULT CURRENT_TIMESTAMP,   -- filled in on insert
    PRIMARY KEY (id),
    UNIQUE (name),
    UNIQUE (slug)
);

-- Insert default categories
-- ON CONFLICT DO NOTHING (no conflict target) skips a row when either unique
-- column (name or slug) already exists, so re-running the script never aborts
-- on a category whose slug was changed after the first run.
INSERT INTO categories (name, name_burmese, slug, description) VALUES
('AI News', 'AI သတင်းများ', 'ai-news', 'Latest AI industry news and updates'),
('AI Tutorials', 'AI သင်ခန်းစာများ', 'tutorials', 'Step-by-step guides and how-tos'),
('Tips & Tricks', 'အကြံပြုချက်များ', 'tips-tricks', 'Productivity hacks and best practices'),
('Upcoming Releases', 'လာမည့် ထုတ်ပြန်မှုများ', 'upcoming', 'New AI models, tools, and products')
ON CONFLICT DO NOTHING;

-- Articles table
-- Stores each article in English and Burmese together with its metadata, SEO
-- fields, source tracking, publication status and counters.
-- NOTE: because of IF NOT EXISTS, the constraints below only take effect when
-- the table is first created; an already existing table is left untouched.
CREATE TABLE IF NOT EXISTS articles (
    id SERIAL PRIMARY KEY,
    title TEXT NOT NULL,
    title_burmese TEXT NOT NULL,
    slug VARCHAR(200) NOT NULL UNIQUE,
    content TEXT NOT NULL,
    content_burmese TEXT NOT NULL,
    excerpt TEXT,
    excerpt_burmese TEXT,
    category_id INTEGER REFERENCES categories(id),  -- optional link to categories

    -- Metadata
    author VARCHAR(200) DEFAULT 'Burmddit AI',
    reading_time INTEGER,  -- in minutes
    featured_image TEXT,
    images TEXT[],  -- multiple images
    videos TEXT[],  -- video embeds (YouTube, etc.)

    -- SEO
    meta_description TEXT,
    meta_keywords TEXT[],

    -- Source tracking
    source_articles JSONB,  -- Array of source URLs
    original_sources TEXT[],

    -- Status
    -- NOT NULL + CHECK: a NULL or misspelled status would never match the
    -- status = 'published' filters used when reading articles.
    status VARCHAR(20) NOT NULL DEFAULT 'draft'
        CONSTRAINT articles_status_check
        CHECK (status IN ('draft', 'published', 'archived')),
    published_at TIMESTAMP,

    -- Analytics
    -- NOT NULL: "counter + 1" on a NULL counter stays NULL forever.
    view_count INTEGER NOT NULL DEFAULT 0,
    share_count INTEGER NOT NULL DEFAULT 0,

    -- Timestamps
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);

-- Create indexes
-- IF NOT EXISTS keeps the script re-runnable, in line with the
-- CREATE TABLE IF NOT EXISTS statements used throughout this file.
-- NOTE: the UNIQUE constraint on articles.slug already provides an index on
-- slug, so idx_articles_slug is redundant; it is kept so the name still exists.
CREATE INDEX IF NOT EXISTS idx_articles_slug ON articles(slug);
CREATE INDEX IF NOT EXISTS idx_articles_category ON articles(category_id);
CREATE INDEX IF NOT EXISTS idx_articles_status ON articles(status);
CREATE INDEX IF NOT EXISTS idx_articles_published ON articles(published_at DESC);
CREATE INDEX IF NOT EXISTS idx_articles_views ON articles(view_count DESC);

-- Full-text search index (for Burmese content)
-- Expression index over the Burmese title and body using the 'simple' text
-- search configuration. A query only uses it when it searches the same
-- expression: to_tsvector('simple', title_burmese || ' ' || content_burmese).
CREATE INDEX IF NOT EXISTS idx_articles_search ON articles USING gin(to_tsvector('simple', title_burmese || ' ' || content_burmese));

-- Raw scraped articles (before processing)
-- One row per scraped URL. Once a raw article has been used, processed is set
-- and compiled_into points at the resulting row in articles.
CREATE TABLE IF NOT EXISTS raw_articles (
    id SERIAL PRIMARY KEY,
    url TEXT NOT NULL UNIQUE,  -- unique, so the same URL is stored only once
    title TEXT NOT NULL,
    content TEXT NOT NULL,
    author VARCHAR(200),
    published_date TIMESTAMP,
    source VARCHAR(100),  -- medium, techcrunch, etc
    category_hint VARCHAR(50),  -- detected category

    -- Processing status
    processed BOOLEAN DEFAULT FALSE,
    -- ON DELETE SET NULL: deleting an article must not be blocked by the raw
    -- rows it was compiled from (the other foreign keys to articles in this
    -- file cascade); the raw row is kept and only the link is cleared.
    compiled_into INTEGER REFERENCES articles(id) ON DELETE SET NULL,

    -- Timestamps
    scraped_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);

-- Lookup indexes for raw_articles: by processing flag and by source site.
-- IF NOT EXISTS keeps the script re-runnable, like the CREATE TABLE statements.
CREATE INDEX IF NOT EXISTS idx_raw_articles_processed ON raw_articles(processed);
CREATE INDEX IF NOT EXISTS idx_raw_articles_source ON raw_articles(source);

-- Tags: free-form labels attached to articles through article_tags.
-- The English name and the slug are each unique; the Burmese label is optional.
CREATE TABLE IF NOT EXISTS tags (
    id             SERIAL,
    name           VARCHAR(100) NOT NULL,
    name_burmese   VARCHAR(100),
    slug           VARCHAR(100) NOT NULL,
    article_count  INTEGER DEFAULT 0,                     -- stored counter, starts at 0
    created_at     TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    PRIMARY KEY (id),
    UNIQUE (name),
    UNIQUE (slug)
);

-- Article-Tag junction table
-- Many-to-many link between articles and tags. A link row is removed
-- automatically when either the article or the tag is deleted.
CREATE TABLE IF NOT EXISTS article_tags (
    article_id INTEGER REFERENCES articles(id) ON DELETE CASCADE,
    tag_id INTEGER REFERENCES tags(id) ON DELETE CASCADE,
    PRIMARY KEY (article_id, tag_id)
);

-- The primary-key index leads with article_id, so it cannot serve lookups by
-- tag_id alone (joins on a shared tag, cascading deletes coming from tags).
CREATE INDEX IF NOT EXISTS idx_article_tags_tag ON article_tags(tag_id);

-- Page views: one row per recorded article view.
-- Rows disappear together with their article (ON DELETE CASCADE).
CREATE TABLE IF NOT EXISTS page_views (
    id          SERIAL,
    article_id  INTEGER,
    ip_hash     VARCHAR(64),   -- hashed IP address, stored instead of the raw IP for privacy
    user_agent  TEXT,
    referrer    TEXT,
    country     VARCHAR(2),    -- presumably a two-letter country code; confirm with the writer
    viewed_at   TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    PRIMARY KEY (id),
    FOREIGN KEY (article_id) REFERENCES articles (id) ON DELETE CASCADE
);

-- Lookup indexes for page_views: by article and by view time.
-- IF NOT EXISTS keeps the script re-runnable, like the CREATE TABLE statements.
CREATE INDEX IF NOT EXISTS idx_page_views_article ON page_views(article_id);
CREATE INDEX IF NOT EXISTS idx_page_views_date ON page_views(viewed_at);

-- Newsletter subscribers: one row per e-mail address.
CREATE TABLE IF NOT EXISTS subscribers (
    id               SERIAL,
    email            VARCHAR(255) NOT NULL,
    status           VARCHAR(20) DEFAULT 'active',          -- active, unsubscribed
    subscribed_at    TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    unsubscribed_at  TIMESTAMP,                             -- no default; NULL until set
    PRIMARY KEY (id),
    UNIQUE (email)
);

-- Pipeline logs: one row per logged pipeline stage event, used for monitoring.
CREATE TABLE IF NOT EXISTS pipeline_logs (
    id                  SERIAL,
    pipeline_run        TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    stage               VARCHAR(50),   -- crawl, cluster, compile, translate, publish
    status              VARCHAR(20),   -- started, completed, failed
    articles_processed  INTEGER,
    error_message       TEXT,
    duration_seconds    INTEGER,
    created_at          TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    PRIMARY KEY (id)
);

-- Index for looking up log rows by pipeline run timestamp.
-- IF NOT EXISTS keeps the script re-runnable, like the CREATE TABLE statements.
CREATE INDEX IF NOT EXISTS idx_pipeline_logs_run ON pipeline_logs(pipeline_run);

-- View: published articles together with their category labels, newest first.
-- The inner join means an article without a matching category row does not
-- appear in this view.
CREATE OR REPLACE VIEW published_articles AS
SELECT
    art.id,
    art.title,
    art.title_burmese,
    art.slug,
    art.excerpt_burmese,
    art.featured_image,
    art.reading_time,
    art.view_count,
    art.published_at,
    cat.name         AS category_name,
    cat.name_burmese AS category_name_burmese,
    cat.slug         AS category_slug
FROM articles AS art
INNER JOIN categories AS cat
    ON art.category_id = cat.id
WHERE art.status = 'published'
ORDER BY art.published_at DESC;

-- increment_view_count(article_slug)
-- Adds one to view_count and refreshes updated_at on the article whose slug
-- equals article_slug. Returns nothing; a slug that matches no row changes
-- nothing.
CREATE OR REPLACE FUNCTION increment_view_count(article_slug VARCHAR)
RETURNS VOID AS $$
BEGIN
    UPDATE articles AS art
    SET
        updated_at = CURRENT_TIMESTAMP,
        view_count = art.view_count + 1
    WHERE art.slug = article_slug;
END;
$$ LANGUAGE plpgsql;

-- get_trending_articles(limit_count DEFAULT 10)
-- Returns at most limit_count published articles whose published_at lies
-- within the last 7 days, highest stored view_count first. The ranking uses
-- the article's view_count column as is; it is not a count of views made
-- during those 7 days. Articles without a matching category are left out by
-- the inner join.
CREATE OR REPLACE FUNCTION get_trending_articles(limit_count INTEGER DEFAULT 10)
RETURNS TABLE (
    id INTEGER,
    title_burmese TEXT,
    slug VARCHAR,
    view_count INTEGER,
    category_name_burmese VARCHAR
) AS $$
BEGIN
    RETURN QUERY
    SELECT
        art.id,
        art.title_burmese,
        art.slug,
        art.view_count,
        cat.name_burmese
    FROM articles AS art
    INNER JOIN categories AS cat
        ON art.category_id = cat.id
    WHERE art.published_at >= CURRENT_TIMESTAMP - INTERVAL '7 days'
        AND art.status = 'published'
    ORDER BY art.view_count DESC
    LIMIT limit_count;
END;
$$ LANGUAGE plpgsql;

-- Function to get related articles (by category and tags)
--
-- Parameters:
--   article_id_param  id of the article to find relatives for
--   limit_count       maximum number of rows to return (default 5)
-- Returns: published articles other than article_id_param that either share
--   its category or share at least one tag with it, newest published_at first.
--
-- Every column reference is table-qualified on purpose: the RETURNS TABLE
-- column names (id, slug, ...) are also PL/pgSQL variables inside the body,
-- so an unqualified "id" is rejected as ambiguous at run time.
-- No DISTINCT is used: each articles row can match at most once (id is the
-- primary key and the tag match is an EXISTS test), and PostgreSQL rejects
-- SELECT DISTINCT combined with an ORDER BY column that is not selected.
CREATE OR REPLACE FUNCTION get_related_articles(article_id_param INTEGER, limit_count INTEGER DEFAULT 5)
RETURNS TABLE (
    id INTEGER,
    title_burmese TEXT,
    slug VARCHAR,
    excerpt_burmese TEXT,
    featured_image TEXT
) AS $$
BEGIN
    RETURN QUERY
    SELECT
        a.id,
        a.title_burmese,
        a.slug,
        a.excerpt_burmese,
        a.featured_image
    FROM articles AS a
    WHERE a.id <> article_id_param
        AND a.status = 'published'
        AND (
            -- same category as the reference article
            a.category_id = (
                SELECT src.category_id
                FROM articles AS src
                WHERE src.id = article_id_param
            )
            -- or at least one tag in common with the reference article
            OR EXISTS (
                SELECT 1
                FROM article_tags AS src_tag
                INNER JOIN article_tags AS other_tag
                    ON other_tag.tag_id = src_tag.tag_id
                WHERE src_tag.article_id = article_id_param
                    AND other_tag.article_id = a.id
            )
        )
    ORDER BY a.published_at DESC
    LIMIT limit_count;
END;
$$ LANGUAGE plpgsql;

-- Trigger function: stamps the row being written with the current timestamp
-- in its updated_at column and hands the modified row back to the caller.
-- Intended for row-level BEFORE triggers on tables that have updated_at.
CREATE OR REPLACE FUNCTION update_updated_at_column()
RETURNS TRIGGER AS $$
BEGIN
    NEW.updated_at := CURRENT_TIMESTAMP;
    RETURN NEW;
END;
$$ LANGUAGE plpgsql;

-- Refresh articles.updated_at on every row update.
-- CREATE TRIGGER has no IF NOT EXISTS form, so the trigger is dropped first to
-- keep this script re-runnable like the CREATE TABLE IF NOT EXISTS statements.
DROP TRIGGER IF EXISTS update_articles_updated_at ON articles;

CREATE TRIGGER update_articles_updated_at
BEFORE UPDATE ON articles
FOR EACH ROW
EXECUTE FUNCTION update_updated_at_column();

-- Initial data: Some common tags
-- ON CONFLICT DO NOTHING (no conflict target) skips a row when either unique
-- column (name or slug) already exists, so re-running the script never aborts
-- on a tag whose slug was changed after the first run.
INSERT INTO tags (name, name_burmese, slug) VALUES
('ChatGPT', 'ChatGPT', 'chatgpt'),
('OpenAI', 'OpenAI', 'openai'),
('Anthropic', 'Anthropic', 'anthropic'),
('Google', 'Google', 'google'),
('Machine Learning', 'စက်သင်ယူမှု', 'machine-learning'),
('Deep Learning', 'နက်ရှိုင်းသောသင်ယူမှု', 'deep-learning'),
('GPT-4', 'GPT-4', 'gpt-4'),
('Claude', 'Claude', 'claude'),
('Prompt Engineering', 'Prompt Engineering', 'prompt-engineering'),
('AI Safety', 'AI ဘေးကင်းရေး', 'ai-safety')
ON CONFLICT DO NOTHING;