"""Extract and compare language-learning vocabulary."""

from __future__ import annotations

import logging
import os
from collections import Counter
from typing import Callable

import regex as re

from .ankiconnect import anki_request
from .config import Config
from .text import extract_first_visible_line, extract_visible_text, normalize_word_key

JAPANESE_CHAR_RE = re.compile(r"[\p{Script=Hiragana}\p{Script=Katakana}\p{Script=Han}ー]+")
JAPANESE_PARTICLES = {
    "は", "が", "を", "に", "へ", "で", "と", "や", "も", "から", "まで", "より", "ば", "なら",
    "の", "ね", "よ", "ぞ", "ぜ", "さ", "わ", "か", "な", "って", "とき", "ってば", "けど", "けれど",
    "しかし", "でも", "ながら", "ほど", "し", "もの", "こと", "ところ", "よう", "らしい", "られる",
}
JAPANESE_GRAMMAR_EXCLUDE = {
    "て", "た", "ます", "れる", "てる", "ぬ", "ん", "しまう", "いる", "ない", "なる", "ある", "だ", "です",
}
JAPANESE_ALLOWED_POS = {"NOUN", "PROPN", "VERB", "ADJ"}


def setup_logging(logfile: str) -> None:
    os.makedirs(os.path.dirname(os.path.abspath(logfile)), exist_ok=True)
    logging.basicConfig(filename=logfile, level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")


def build_query_from_decks(decks: list[str]) -> str:
    return " OR ".join(f'deck:"{d}"' for d in decks)


def japanese_filter(token) -> bool:
    text = (token.text or "").strip()
    lemma = (token.lemma_ or "").strip()
    if not text or not JAPANESE_CHAR_RE.fullmatch(text):
        return False
    if lemma in JAPANESE_GRAMMAR_EXCLUDE or text in JAPANESE_PARTICLES:
        return False
    if getattr(token, "pos_", None) not in JAPANESE_ALLOWED_POS:
        return False
    if getattr(token, "is_stop", False) or getattr(token, "like_url", False) or getattr(token, "like_email", False):
        return False
    if any(c in text for c in "<>=/\\:&%"):
        return False
    return text not in {"ruby", "rt", "div", "br", "nbsp", "href", "strong", "a"}


def spanish_filter(token) -> bool:
    return bool(getattr(token, "is_alpha", False)) and not bool(getattr(token, "is_stop", False))


def spanish_format(token) -> str:
    return (token.lemma_ or token.text or "").lower().strip()


def japanese_format(token) -> str:
    lemma = (token.lemma_ or "").strip()
    surface = (token.text or "").strip()
    if lemma and surface and lemma != surface:
        return f"{lemma} ({surface})"
    return lemma or surface


LANGUAGE_PROFILES = {
    "spanish": {"token_filter": spanish_filter, "output_format": spanish_format},
    "japanese": {"token_filter": japanese_filter, "output_format": japanese_format},
}


def load_spacy_model(model_name: str):
    try:
        import spacy  # type: ignore
    except Exception as e:
        raise RuntimeError("Failed to import spaCy. Use a Python version supported by spaCy.") from e
    try:
        return spacy.load(model_name)
    except Exception as e:
        raise RuntimeError(f"Failed to load spaCy model '{model_name}'. Try: python -m spacy download {model_name}") from e


def get_notes(query: str, config: Config, request: Callable = anki_request) -> list[dict]:
    note_ids = request("findNotes", url=config.anki_connect_url, query=query) or []
    if not note_ids:
        return []
    return request("notesInfo", url=config.anki_connect_url, notes=note_ids) or []


def extract_counts(
    notes: list[dict],
    field_name: str,
    nlp,
    token_filter: Callable,
    output_format: Callable,
    use_full_field: bool,
) -> Counter:
    counter: Counter = Counter()
    for note in notes:
        fields = note.get("fields", {}) or {}
        raw_val = (fields.get(field_name, {}) or {}).get("value", "") or ""
        text = extract_visible_text(raw_val) if use_full_field else extract_first_visible_line(raw_val)
        if not text:
            continue
        for token in nlp(text):
            if token_filter(token):
                key = output_format(token)
                if key:
                    counter[key] += 1
    return counter


def write_counts(counter: Counter, out_path: str, min_freq: int) -> int:
    items = [(w, c) for (w, c) in counter.items() if c >= min_freq]
    items.sort(key=lambda x: (-x[1], x[0]))
    os.makedirs(os.path.dirname(os.path.abspath(out_path)), exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as f:
        for word, freq in items:
            f.write(f"{word} {freq}\n")
    return len(items)


def read_word_file(path: str) -> set[str]:
    words: set[str] = set()
    with open(os.path.expanduser(path), "r", encoding="utf-8") as fh:
        for line in fh:
            stripped = line.strip()
            if not stripped:
                continue
            word = stripped.rsplit(" ", 1)[0]
            words.add(normalize_word_key(word))
    return words


def compare_word_files(source_path: str, known_path: str) -> list[str]:
    known = read_word_file(known_path)
    new_words: list[str] = []
    with open(os.path.expanduser(source_path), "r", encoding="utf-8") as fh:
        for line in fh:
            stripped = line.strip()
            if not stripped:
                continue
            word = stripped.rsplit(" ", 1)[0]
            if normalize_word_key(word) not in known:
                new_words.append(stripped)
    return new_words


def extract_words(
    config: Config,
    lang: str,
    query: str | None = None,
    decks: list[str] | None = None,
    field: str | None = None,
    min_freq: int = 2,
    outdir: str | None = None,
    out: str | None = None,
    full_field: bool = False,
    spacy_model: str | None = None,
    request: Callable = anki_request,
) -> dict[str, object]:
    language_bucket = config.language_name(lang)
    profile = LANGUAGE_PROFILES[language_bucket]
    search_query = query or build_query_from_decks(decks or config.decks_for(lang))
    out_dir = os.path.expanduser(outdir) if outdir else os.path.join(config.word_output_root, language_bucket)
    out_path = os.path.expanduser(out) if out else os.path.join(out_dir, f"words_{lang}.txt")
    model_name = spacy_model or str(config.language(lang).get("word_model"))
    nlp = load_spacy_model(model_name)
    notes = get_notes(search_query, config, request=request)
    if notes:
        fields0 = (notes[0].get("fields", {}) or {})
        field_name = field or config.field_for(lang)
        if field_name not in fields0:
            raise RuntimeError(f"Field '{field_name}' not found. Available fields: {list(fields0.keys())}")
    else:
        field_name = field or config.field_for(lang)
    counter = extract_counts(notes, field_name, nlp, profile["token_filter"], profile["output_format"], full_field)
    written = write_counts(counter, out_path, min_freq)
    return {"query": search_query, "notes": len(notes), "unique": len(counter), "written": written, "out": out_path}