#!/usr/bin/env python3
"""
Process Wiktionary JSONL data and load into SQLite database.
Source: kaikki.org Italian dictionary export
"""

import json
import sqlite3
import sys
from pathlib import Path

DB_PATH = Path(__file__).parent / "italian.db"
JSONL_PATH = Path(__file__).parent / "italian-dict.jsonl"

# Map Wiktionary POS tags to our categories
POS_MAP = {
    "verb": "verbs",
    "noun": "nouns",
    "adj": "adjectives",
    "adv": "adverbs",
    "phrase": "phrases",
    "intj": "phrases",
    "prep": "phrases",
    "conj": "phrases",
    "pron": "nouns",
    "num": "nouns",
    "det": "adjectives",
    "particle": "phrases",
    "suffix": "phrases",
    "prefix": "phrases",
    "affix": "phrases",
    "name": "nouns",
}

def create_database():
    """Create SQLite database with schema."""
    conn = sqlite3.connect(DB_PATH)
    cur = conn.cursor()

    cur.execute("DROP TABLE IF EXISTS vocabulary")
    cur.execute("""
        CREATE TABLE vocabulary (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            italian TEXT NOT NULL,
            english TEXT NOT NULL,
            category TEXT NOT NULL,
            notes TEXT DEFAULT ''
        )
    """)

    # Create indexes for fast lookups
    cur.execute("CREATE INDEX idx_italian ON vocabulary(italian)")
    cur.execute("CREATE INDEX idx_category ON vocabulary(category)")
    cur.execute("CREATE INDEX idx_english ON vocabulary(english)")

    # Full-text search table
    cur.execute("""
        CREATE VIRTUAL TABLE vocabulary_fts USING fts5(
            italian, english, notes,
            content='vocabulary',
            content_rowid='id'
        )
    """)

    # Triggers to keep FTS in sync
    cur.execute("""
        CREATE TRIGGER vocabulary_ai AFTER INSERT ON vocabulary BEGIN
            INSERT INTO vocabulary_fts(rowid, italian, english, notes)
            VALUES (new.id, new.italian, new.english, new.notes);
        END
    """)

    conn.commit()
    return conn

def extract_english_translations(senses):
    """Extract English translations from sense data."""
    translations = []
    for sense in senses:
        # "glosses" holds cleaned English definitions; "raw_glosses" may
        # carry extra context (register, domain labels) worth keeping.
        for field in ("glosses", "raw_glosses"):
            for gloss in sense.get(field, []):
                if not gloss or not isinstance(gloss, str):
                    continue
                # Skip bare form-of references; they are cross-references,
                # not translations.
                if gloss.startswith(("inflection of", "plural of")):
                    continue
                if gloss not in translations:
                    translations.append(gloss)

    return translations
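
# Illustrative example with hypothetical kaikki-style sense data:
#   extract_english_translations([
#       {"glosses": ["to speak, to talk"]},
#       {"glosses": ["plural of parola"]},   # dropped as a form-of gloss
#   ])
#   -> ["to speak, to talk"]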

def process_jsonl(conn, max_entries=100000):
    """Process JSONL file and insert into database."""
    cur = conn.cursor()

    seen_words = set()  # Track unique italian + category combinations
    count = 0
    skipped = 0

    print(f"Processing {JSONL_PATH}...")

    with open(JSONL_PATH, 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f, 1):
            if count >= max_entries:
                break

            if line_num % 50000 == 0:
                print(f"  Processed {line_num} lines, {count} entries added...")
                conn.commit()

            try:
                entry = json.loads(line.strip())
            except json.JSONDecodeError:
                skipped += 1
                continue

            # Get word
            word = entry.get("word", "").strip()
            if not word:
                skipped += 1
                continue

            # Get part of speech and map to category
            pos = entry.get("pos", "").lower()
            category = POS_MAP.get(pos, "phrases")

            # Skip if we've seen this word+category combo
            key = (word.lower(), category)
            if key in seen_words:
                skipped += 1
                continue

            # Extract English translations
            senses = entry.get("senses", [])
            translations = extract_english_translations(senses)

            if not translations:
                skipped += 1
                continue

            # First translation becomes the primary gloss; up to two more
            # are kept as notes (the join yields "" when there are no extras)
            english = translations[0]
            notes = "; ".join(translations[1:3])

            # Truncate if too long
            if len(english) > 200:
                english = english[:197] + "..."
            if len(notes) > 300:
                notes = notes[:297] + "..."

            # Insert into database
            cur.execute(
                "INSERT INTO vocabulary (italian, english, category, notes) VALUES (?, ?, ?, ?)",
                (word, english, category, notes)
            )

            seen_words.add(key)
            count += 1

    conn.commit()
    print(f"Done! Added {count} entries, skipped {skipped}")
    return count
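
# Sketch of querying the FTS5 index built above; `search` is illustrative
# and not wired into main(). FTS5 MATCH syntax applies to the query string
# (e.g. "parl*" for prefix matching).
def search(conn, query, limit=10):
    """Full-text search over italian/english/notes via vocabulary_fts."""
    cur = conn.cursor()
    cur.execute(
        """
        SELECT v.italian, v.english, v.category
        FROM vocabulary_fts
        JOIN vocabulary v ON v.id = vocabulary_fts.rowid
        WHERE vocabulary_fts MATCH ?
        ORDER BY rank
        LIMIT ?
        """,
        (query, limit),
    )
    return cur.fetchall()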

def main():
    if not JSONL_PATH.exists():
        print(f"Error: {JSONL_PATH} not found")
        print("Download from: https://kaikki.org/dictionary/Italian/kaikki.org-dictionary-Italian.jsonl")
        sys.exit(1)

    print("Creating database...")
    conn = create_database()

    print("Processing vocabulary...")
    count = process_jsonl(conn)

    # Print stats
    cur = conn.cursor()
    cur.execute("SELECT category, COUNT(*) FROM vocabulary GROUP BY category ORDER BY COUNT(*) DESC")
    print("\nCategory breakdown:")
    for cat, cnt in cur.fetchall():
        print(f"  {cat}: {cnt}")

    conn.close()
    print(f"\nDatabase saved to {DB_PATH}")

if __name__ == "__main__":
    main()
