Rename project to Saiki and unify CLI

2026-05-26 18:09:26 -04:00
parent 8ee1f8de25
commit f38030238c
19 changed files with 1274 additions and 1326 deletions
@@ -0,0 +1,2 @@
+"""Utilities for Anki-based language learning workflows."""
+
@@ -0,0 +1,19 @@
+"""Small AnkiConnect client."""
+
+from __future__ import annotations
+
+import requests
+
+
+def anki_request(action: str, url: str = "http://localhost:8765", **params):
+    """Call a single AnkiConnect action and return its ``result`` value.
+
+    Extra keyword arguments are forwarded as the action's ``params`` object.
+
+    Raises:
+        requests.HTTPError: If the HTTP status signals a failure.
+        RuntimeError: If the response body carries a non-null ``error``.
+    """
+    payload = {"action": action, "version": 6, "params": params}
+    response = requests.post(url, json=payload, timeout=30)
+    response.raise_for_status()
+    body = response.json()
+    error = body.get("error")
+    if error is not None:
+        raise RuntimeError(f"AnkiConnect error for {action}: {error}")
+    return body["result"]
+
@@ -0,0 +1,126 @@
+"""Extract Anki audio media into playlists."""
+
+from __future__ import annotations
+
+import os
+import re
+import shutil
+import subprocess
+import tempfile
+from typing import Callable
+
+from .ankiconnect import anki_request
+from .config import Config
+
+AUDIO_EXTS = (".mp3", ".wav", ".ogg", ".m4a", ".flac")
+
+
+def resolve_media_paths(media_dir: str, out_dir: str, media_name: str) -> tuple[str, str] | None:
+    """Map a media name to its ``(source, destination)`` paths.
+
+    Returns ``None`` for names that are absolute, that climb out of the media
+    directory, or that normalize to the directory itself.
+    """
+    normalized = os.path.normpath(media_name)
+    # "." (also what "a/.." normalizes to) is the directory, not a media file.
+    if os.path.isabs(normalized) or normalized == os.curdir:
+        return None
+    # Compare whole path components so a file merely named "..intro.mp3" is
+    # still accepted while ".." and "../x" are rejected.
+    if normalized == os.pardir or normalized.startswith(os.pardir + os.sep):
+        return None
+    return os.path.join(media_dir, normalized), os.path.join(out_dir, normalized)
+
+
+def build_playlist(out_dir: str, language: str) -> str:
+    """Write ``<language>.m3u`` listing every audio file under ``out_dir``.
+
+    Entries are paths relative to ``out_dir``, sorted, one per line. The
+    playlist itself and the ``<language>_concat.mp3`` file are left out.
+    Returns the playlist path.
+    """
+    m3u_path = os.path.join(out_dir, f"{language}.m3u")
+    excluded = {os.path.basename(m3u_path), f"{language}_concat.mp3"}
+    entries: list[str] = []
+    for dirpath, _subdirs, names in os.walk(out_dir):
+        for name in names:
+            full_path = os.path.join(dirpath, name)
+            relative = os.path.relpath(full_path, out_dir)
+            if relative in excluded:
+                continue
+            if not name.lower().endswith(AUDIO_EXTS):
+                continue
+            if os.path.isfile(full_path):
+                entries.append(relative)
+
+    entries.sort()
+    with open(m3u_path, "w", encoding="utf-8") as playlist:
+        for entry in entries:
+            playlist.write(f"{entry}\n")
+    return m3u_path
+
+
+def concat_audio_from_m3u(out_dir: str, m3u_path: str, out_path: str) -> None:
+    """Concatenate the playlist's audio files into one MP3 using ffmpeg.
+
+    Playlist entries are read relative to ``out_dir``; entries that are not
+    existing files with an extension in ``AUDIO_EXTS`` are ignored.
+
+    Raises:
+        RuntimeError: If ffmpeg is not on PATH or no usable audio file remains.
+        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
+    """
+    if shutil.which("ffmpeg") is None:
+        raise RuntimeError("ffmpeg not found in PATH. Install ffmpeg to use --concat.")
+
+    with open(m3u_path, "r", encoding="utf-8") as fh:
+        rel_files = [line.strip() for line in fh if line.strip()]
+
+    abs_files = [
+        os.path.abspath(os.path.join(out_dir, rel))
+        for rel in rel_files
+        if os.path.isfile(os.path.join(out_dir, rel)) and rel.lower().endswith(AUDIO_EXTS)
+    ]
+    if not abs_files:
+        raise RuntimeError("No audio files found to concatenate.")
+
+    # delete=False keeps the list file on disk after the handle is closed so
+    # ffmpeg can read it; it is removed in the finally block below.
+    with tempfile.NamedTemporaryFile("w", delete=False, encoding="utf-8") as tmp:
+        concat_list_path = tmp.name
+        for path in abs_files:
+            # Each single quote in the path is written as '\'' inside the quoted entry.
+            tmp.write(f"file '{path.replace(chr(39), chr(39) + chr(92) + chr(39) + chr(39))}'\n")
+
+    cmd = [
+        "ffmpeg", "-hide_banner", "-loglevel", "error", "-f", "concat", "-safe", "0",
+        "-i", concat_list_path, "-c:a", "libmp3lame", "-q:a", "4", "-y", out_path,
+    ]
+    try:
+        subprocess.run(cmd, check=True)
+    finally:
+        # Best-effort cleanup: a failure to delete the list file is ignored.
+        try:
+            os.remove(concat_list_path)
+        except OSError:
+            pass
+
+
+def extract_audio(
+    config: Config,
+    lang: str,
+    outdir: str | None = None,
+    media_dir: str | None = None,
+    copy_only_new: bool = False,
+    concat: bool = False,
+    request: Callable = anki_request,
+) -> dict[str, object]:
+    """Copy the audio referenced by a language's decks and build a playlist.
+
+    Notes are looked up per configured deck, every ``[sound:...]`` reference in
+    their fields is resolved against the media directory, and existing files
+    are copied to the output directory.
+
+    Args:
+        config: Loaded configuration.
+        lang: Language key used to look up the language name and decks.
+        outdir: Output directory; defaults to ``audio_output_root/<language>``.
+        media_dir: Media directory; defaults to ``config.media_dir``.
+        copy_only_new: Skip files that already exist in the output directory.
+        concat: Also build ``<language>_concat.mp3`` from the playlist.
+        request: Callable used to talk to AnkiConnect.
+
+    Returns:
+        A dict with ``copied`` (number of copies made), ``playlist``,
+        ``outdir`` and ``concat`` (path, or ``None`` when not requested).
+
+    Raises:
+        RuntimeError: If no decks are configured for ``lang``.
+    """
+    language = config.language_name(lang)
+    selected_decks = config.decks_for(lang)
+    if not selected_decks:
+        raise RuntimeError(f"No decks configured for language: {lang}")
+
+    media_root = media_dir or config.media_dir
+    out_dir = os.path.expanduser(outdir) if outdir else os.path.join(config.audio_output_root, language)
+    os.makedirs(out_dir, exist_ok=True)
+
+    all_ids: list[int] = []
+    for deck in selected_decks:
+        all_ids.extend(request("findNotes", url=config.anki_connect_url, query=f'deck:"{deck}"') or [])
+
+    if not all_ids:
+        # Nothing to copy, but the playlist is still rebuilt from what is already on disk.
+        return {"copied": 0, "playlist": build_playlist(out_dir, language), "outdir": out_dir, "concat": None}
+
+    notes = request("notesInfo", url=config.anki_connect_url, notes=all_ids) or []
+    copied: list[str] = []
+    for note in notes:
+        for field in (note.get("fields", {}) or {}).values():
+            val = field.get("value", "") or ""
+            for match in re.findall(r"\[sound:(.+?)\]", val):
+                paths = resolve_media_paths(media_root, out_dir, match)
+                if paths is None:
+                    continue
+                src, dst = paths
+                if not os.path.exists(src):
+                    continue
+                os.makedirs(os.path.dirname(dst), exist_ok=True)
+                if copy_only_new and os.path.exists(dst):
+                    continue
+                shutil.copy2(src, dst)
+                copied.append(match)
+
+    m3u_path = build_playlist(out_dir, language)
+    concat_path = None
+    if concat:
+        concat_path = os.path.join(out_dir, f"{language}_concat.mp3")
+        concat_audio_from_m3u(out_dir, m3u_path, concat_path)
+    return {"copied": len(copied), "playlist": m3u_path, "outdir": out_dir, "concat": concat_path}
+
@@ -0,0 +1,126 @@
+"""Unified command-line interface for Saiki."""
+
+from __future__ import annotations
+
+import argparse
+import sys
+
+from .audio import extract_audio
+from .config import Config, language_choices, load_config
+from .importer import import_sentences
+from .words import compare_word_files, extract_words
+from .youtube import run_youtube
+
+
+def add_config_arg(parser: argparse.ArgumentParser) -> None:
+    """Register the shared ``--config`` option on ``parser``."""
+    parser.add_argument(
+        "--config",
+        help="Path to YAML config file.",
+    )
+
+
+def build_parser(config: Config | None = None) -> argparse.ArgumentParser:
+    """Build the top-level parser with one sub-command per tool.
+
+    The ``lang`` choices come from the languages defined in ``config``; when
+    no config is passed, one is loaded with ``load_config()``.
+    """
+    choices = language_choices(config or load_config())
+    parser = argparse.ArgumentParser(description="Saiki: sentence mining and listening tools for Anki.")
+    add_config_arg(parser)
+    sub = parser.add_subparsers(dest="command", required=True)
+
+    audio = sub.add_parser("audio", help="Extract Anki audio into playlists.")
+    audio.add_argument("lang", choices=choices)
+    audio.add_argument("--concat", action="store_true")
+    audio.add_argument("--outdir")
+    audio.add_argument("--media-dir")
+    audio.add_argument("--copy-only-new", action="store_true")
+
+    words = sub.add_parser("words", help="Extract frequent words from Anki.")
+    words.add_argument("lang", choices=choices)
+    # A raw search query and explicit deck names cannot be combined.
+    group = words.add_mutually_exclusive_group()
+    group.add_argument("--query")
+    group.add_argument("--deck", action="append")
+    words.add_argument("--field")
+    words.add_argument("--min-freq", type=int, default=2)
+    words.add_argument("--outdir")
+    words.add_argument("--out")
+    words.add_argument("--full-field", action="store_true")
+    words.add_argument("--spacy-model")
+
+    compare = sub.add_parser("compare-words", help="Print words in source that are not in known.")
+    compare.add_argument("source")
+    compare.add_argument("known")
+
+    youtube = sub.add_parser("youtube", help="Mine a YouTube transcript.")
+    youtube.add_argument("lang", choices=choices)
+    youtube.add_argument("video")
+    youtube.add_argument("--mode", choices=["vocab", "sentences"], default="vocab")
+    youtube.add_argument("--top", type=int)
+    youtube.add_argument("--no-stopwords", action="store_true")
+    youtube.add_argument("--raw", action="store_true")
+    youtube.add_argument("--out")
+    youtube.add_argument("--format", choices=["tsv", "csv"], default="tsv")
+    youtube.add_argument("--known-words", help="Word list to filter vocab_guess against.")
+    youtube.add_argument("--only-new", action="store_true", help="Only export sentences with unknown vocab.")
+
+    importer = sub.add_parser("import", help="Generate TTS and import sentence cards.")
+    importer.add_argument("lang", choices=choices)
+    importer.add_argument("sentence_file", nargs="?")
+    importer.add_argument("--tags", help="Comma-separated tags. text-to-speech is always included.")
+
+    return parser
+
+
+def main(argv: list[str] | None = None) -> int:
+    """Parse ``argv``, run the selected sub-command and return an exit code.
+
+    Returns 0 on success. The ``import`` command returns 1 when at least one
+    sentence failed. 2 is returned after printing help if no command matched.
+    """
+    # First pass: read only --config so the language choices of the real
+    # parser can be built from the loaded configuration.
+    pre = argparse.ArgumentParser(add_help=False)
+    add_config_arg(pre)
+    known, _ = pre.parse_known_args(argv)
+    config = load_config(known.config)
+    parser = build_parser(config)
+    args = parser.parse_args(argv)
+
+    if args.command == "audio":
+        result = extract_audio(config, args.lang, args.outdir, args.media_dir, args.copy_only_new, args.concat)
+        print(f"Copied {result['copied']} files")
+        print(f"Playlist: {result['playlist']}")
+        print(f"Output directory: {result['outdir']}")
+        if result["concat"]:
+            print(f"Concatenated file: {result['concat']}")
+        return 0
+
+    if args.command == "words":
+        result = extract_words(
+            config, args.lang, args.query, args.deck, args.field, args.min_freq,
+            args.outdir, args.out, args.full_field, args.spacy_model,
+        )
+        print(f"Query: {result['query']}")
+        print(f"Found {result['notes']} notes")
+        print(f"Extracted {result['unique']} unique entries")
+        print(f"Wrote {result['written']} entries to: {result['out']}")
+        return 0
+
+    if args.command == "compare-words":
+        for line in compare_word_files(args.source, args.known):
+            print(line)
+        return 0
+
+    if args.command == "youtube":
+        result = run_youtube(
+            config, args.lang, args.video, args.mode, args.top, args.no_stopwords,
+            args.raw, args.out, args.format, args.known_words, args.only_new,
+        )
+        # Sentences without --out are printed; with --out only a summary is
+        # shown; vocab mode always prints the word counts.
+        if args.mode == "sentences" and not args.out:
+            for line in result["lines"]:
+                print(f"[{line.start:.2f}s] {line.text}")
+        elif args.mode == "sentences":
+            print(f"Wrote {result['written']} rows to: {result['out']}")
+        else:
+            for word, count in result["items"]:
+                print(f"{word}: {count}")
+        return 0
+
+    if args.command == "import":
+        result = import_sentences(config, args.lang, args.sentence_file, args.tags)
+        print(f"Done. Added {result.added}/{result.processed} cards. Failed: {result.failed}")
+        return 0 if result.failed == 0 else 1
+
+    parser.print_help()
+    return 2
+
+
+# Script entry point: the return value of main() becomes the exit status.
+if __name__ == "__main__":
+    raise SystemExit(main(sys.argv[1:]))
@@ -0,0 +1,148 @@
+"""Configuration loading for Saiki.
+
+Defaults mirror the original scripts. Users can override them with YAML at
+~/.config/saiki/config.yaml or by passing --config to the CLI.
+"""
+
+from __future__ import annotations
+
+import copy
+import os
+from dataclasses import dataclass
+from typing import Any
+
+try:
+    import yaml
+except Exception:  # pragma: no cover - handled when config files are loaded
+    yaml = None
+
+
+# Built-in settings. load_config() deep-merges the user's YAML mapping over a
+# copy of this dict, so a config file only needs the keys it wants to change.
+# Each entry under "languages" is keyed by the short code used on the CLI.
+DEFAULT_CONFIG: dict[str, Any] = {
+    "anki_connect_url": "http://localhost:8765",
+    "media_dir": "~/.var/app/net.ankiweb.Anki/data/Anki2/User 1/collection.media",
+    "audio_output_root": "~/Languages/Anki/anki-audio",
+    "word_output_root": "~/Languages/Anki/anki-words",
+    "sentence_dir": "~/Languages/Anki",
+    "note_model": "Basic",
+    "fields": {"front": "Front", "back": "Back"},
+    "languages": {
+        "jp": {
+            "name": "japanese",
+            "transcript_code": "ja",
+            "tts_code": "ja",
+            "tts_tld": "com",
+            "tts_tempo": 1.35,
+            "decks": ["日本語"],
+            "word_model": "ja_core_news_lg",
+            "field": "Back",
+            "sentence_file": "sentences_jp.txt",
+        },
+        "es": {
+            "name": "spanish",
+            "transcript_code": "es",
+            "tts_code": "es",
+            "tts_tld": "es",
+            "tts_tempo": 1.25,
+            "decks": ["Español"],
+            "word_model": "es_core_news_sm",
+            "field": "Back",
+            "sentence_file": "sentences_es.txt",
+        },
+    },
+}
+
+
+@dataclass(frozen=True)
+class Config:
+    """Read-only accessors over the merged configuration mapping.
+
+    Sections that a YAML file can leave empty (``fields:``, ``languages:``, a
+    language entry, ``decks:``) arrive as ``None``. They are treated as empty
+    so lookups produce a clear result instead of a ``TypeError``.
+    """
+
+    # The merged configuration mapping that every accessor reads from.
+    data: dict[str, Any]
+
+    @property
+    def anki_connect_url(self) -> str:
+        """URL of the AnkiConnect endpoint."""
+        return str(self.data["anki_connect_url"])
+
+    @property
+    def media_dir(self) -> str:
+        """Media directory with ``~`` and environment variables expanded."""
+        return expand_path(str(self.data["media_dir"]))
+
+    @property
+    def audio_output_root(self) -> str:
+        """Root directory for extracted audio, expanded."""
+        return expand_path(str(self.data["audio_output_root"]))
+
+    @property
+    def word_output_root(self) -> str:
+        """Root directory for word lists, expanded."""
+        return expand_path(str(self.data["word_output_root"]))
+
+    @property
+    def sentence_dir(self) -> str:
+        """Directory holding sentence files, expanded."""
+        return expand_path(str(self.data["sentence_dir"]))
+
+    @property
+    def note_model(self) -> str:
+        """Note model name, ``"Basic"`` when the key is absent."""
+        return str(self.data.get("note_model", "Basic"))
+
+    @property
+    def fields(self) -> dict[str, str]:
+        """Copy of the ``fields`` mapping; empty when unset or null."""
+        return dict(self.data.get("fields") or {})
+
+    @property
+    def languages(self) -> dict[str, dict[str, Any]]:
+        """Copy of the ``languages`` mapping; empty when unset or null."""
+        return dict(self.data.get("languages") or {})
+
+    def language(self, lang: str) -> dict[str, Any]:
+        """Return a copy of the settings for ``lang``.
+
+        Raises:
+            ValueError: If ``lang`` is not a configured language.
+        """
+        try:
+            # ``or {}`` covers a language key whose value was left empty.
+            return dict(self.languages[lang] or {})
+        except KeyError as e:
+            available = ", ".join(sorted(self.languages))
+            raise ValueError(f"Unsupported language '{lang}'. Available: {available}") from e
+
+    def language_name(self, lang: str) -> str:
+        """Return the ``name`` entry of ``lang``."""
+        return str(self.language(lang)["name"])
+
+    def transcript_code(self, lang: str) -> str:
+        """Return the ``transcript_code`` entry of ``lang``."""
+        return str(self.language(lang)["transcript_code"])
+
+    def decks_for(self, lang: str) -> list[str]:
+        """Return the deck names of ``lang``; empty when unset or null."""
+        return list(self.language(lang).get("decks") or [])
+
+    def field_for(self, lang: str) -> str:
+        """Return the note field of ``lang``, falling back to the back field."""
+        return str(self.language(lang).get("field", self.fields.get("back", "Back")))
+
+    def sentence_file_for(self, lang: str) -> str:
+        """Return the sentence file path of ``lang``.
+
+        Absolute and ``~`` paths are used as given; anything else is taken
+        relative to ``sentence_dir``.
+        """
+        value = str(self.language(lang).get("sentence_file", f"sentences_{lang}.txt"))
+        return expand_path(value if os.path.isabs(value) or value.startswith("~") else os.path.join(self.sentence_dir, value))
+
+
+def expand_path(path: str) -> str:
+    """Expand environment variables first, then a leading ``~``."""
+    with_variables = os.path.expandvars(path)
+    return os.path.expanduser(with_variables)
+
+
+def default_config_path() -> str:
+    """Return the expanded location of the per-user config file."""
+    location = "~/.config/saiki/config.yaml"
+    return expand_path(location)
+
+
+def deep_merge(base: dict[str, Any], override: dict[str, Any]) -> dict[str, Any]:
+    """Return a deep copy of ``base`` with ``override`` layered on top.
+
+    Where both sides hold a dict under the same key the merge recurses;
+    in every other case the override value replaces the base value.
+    """
+    merged = copy.deepcopy(base)
+    for name, incoming in override.items():
+        current = merged.get(name)
+        both_mappings = isinstance(incoming, dict) and isinstance(current, dict)
+        merged[name] = deep_merge(current, incoming) if both_mappings else incoming
+    return merged
+
+
+def load_config(path: str | None = None) -> Config:
+    """Build a Config from the defaults plus an optional YAML file.
+
+    ``path`` falls back to the default per-user location. A file that does
+    not exist is skipped, leaving the defaults unchanged.
+
+    Raises:
+        RuntimeError: If a config file exists but PyYAML is unavailable, or
+            the file's top level is not a mapping.
+    """
+    defaults = copy.deepcopy(DEFAULT_CONFIG)
+    config_path = expand_path(path) if path else default_config_path()
+    if not os.path.exists(config_path):
+        return Config(defaults)
+
+    if yaml is None:
+        raise RuntimeError("Loading config files requires PyYAML. Install pyyaml.")
+    with open(config_path, "r", encoding="utf-8") as handle:
+        user_settings = yaml.safe_load(handle) or {}
+    if not isinstance(user_settings, dict):
+        raise RuntimeError(f"Config must be a YAML mapping: {config_path}")
+    return Config(deep_merge(defaults, user_settings))
+
+
+def language_choices(config: Config) -> list[str]:
+    """Return the configured language keys in sorted order."""
+    codes = list(config.languages.keys())
+    codes.sort()
+    return codes
@@ -0,0 +1,112 @@
+"""Generate TTS audio and add sentence notes to Anki."""
+
+from __future__ import annotations
+
+import os
+import csv
+import shutil
+import subprocess
+import tempfile
+import time
+from dataclasses import dataclass
+from typing import Callable
+
+from .ankiconnect import anki_request
+from .config import Config
+
+
+# Immutable summary returned by import_sentences().
+@dataclass(frozen=True)
+class ImportResult:
+    # Number of sentences read from the source file.
+    processed: int
+    # Sentences for which the note was added.
+    added: int
+    # Sentences whose TTS generation or note creation raised.
+    failed: int
+
+
+def parse_tags(value: str | None) -> list[str]:
+    """Turn a comma-separated tag string into the note's tag list.
+
+    ``text-to-speech`` always comes first. Without a value, ``AI-generated``
+    is added as the only other tag.
+    """
+    if not value:
+        return ["text-to-speech", "AI-generated"]
+    pieces = (piece.strip() for piece in value.split(","))
+    return ["text-to-speech", *(piece for piece in pieces if piece)]
+
+
+def require_command(name: str) -> None:
+    """Raise RuntimeError unless ``name`` is an executable found on PATH."""
+    location = shutil.which(name)
+    if location is None:
+        raise RuntimeError(f"Required command not found: {name}")
+
+
+def generate_tts(sentence: str, raw_output: str, lang_code: str, tld: str) -> None:
+    """Run ``gtts-cli`` to synthesize ``sentence`` into ``raw_output``.
+
+    Raises:
+        subprocess.CalledProcessError: If the command exits non-zero.
+    """
+    command = [
+        "gtts-cli", sentence,
+        "--lang", lang_code,
+        "--tld", tld,
+        "--output", raw_output,
+    ]
+    subprocess.run(command, check=True)
+
+
+def speed_audio(raw_output: str, output_path: str, tempo: float) -> None:
+    """Re-encode ``raw_output`` to ``output_path`` through ffmpeg's atempo filter.
+
+    Raises:
+        subprocess.CalledProcessError: If ffmpeg exits non-zero.
+    """
+    command = [
+        "ffmpeg", "-loglevel", "error",
+        "-i", raw_output,
+        "-filter:a", f"atempo={tempo}",
+        "-y", output_path,
+    ]
+    # stdin is detached so ffmpeg cannot read from the caller's terminal.
+    subprocess.run(command, stdin=subprocess.DEVNULL, check=True)
+
+
+def read_sentences(path: str) -> list[str]:
+    """Read the non-empty sentences from a text, TSV or CSV file.
+
+    ``.tsv`` / ``.csv`` files must have a ``sentence`` header and only that
+    column is used. Any other file is read as one sentence per line.
+
+    Raises:
+        RuntimeError: If a TSV/CSV file has no ``sentence`` header.
+    """
+    expanded = os.path.expanduser(path)
+    lowered = expanded.lower()
+    # utf-8-sig reads plain UTF-8 unchanged but drops a leading BOM, which
+    # would otherwise stick to the first header name or the first sentence.
+    if lowered.endswith((".tsv", ".csv")):
+        delimiter = "\t" if lowered.endswith(".tsv") else ","
+        with open(expanded, "r", encoding="utf-8-sig", newline="") as fh:
+            reader = csv.DictReader(fh, delimiter=delimiter)
+            if reader.fieldnames and "sentence" in reader.fieldnames:
+                # Rows shorter than the header give None for missing columns.
+                cells = ((row.get("sentence") or "").strip() for row in reader)
+                return [cell for cell in cells if cell]
+        raise RuntimeError("TSV/CSV sentence imports must include a 'sentence' header.")
+
+    with open(expanded, "r", encoding="utf-8-sig") as fh:
+        return [line.strip() for line in fh if line.strip()]
+
+
+def import_sentences(
+    config: Config,
+    lang: str,
+    sentence_file: str | None = None,
+    tags_value: str | None = None,
+    request: Callable = anki_request,
+) -> ImportResult:
+    """Generate TTS audio for each sentence and add it to Anki as a note.
+
+    Notes go to the first deck configured for ``lang``. The sentence is put
+    in the back field and the sped-up audio is attached to the front field.
+    A sentence that fails is counted and logged, and the run continues.
+
+    Args:
+        config: Loaded configuration.
+        lang: Language key.
+        sentence_file: Source file; defaults to the configured sentence file.
+        tags_value: Comma-separated tags, see ``parse_tags``.
+        request: Callable used to talk to AnkiConnect.
+
+    Returns:
+        Counts of processed, added and failed sentences.
+
+    Raises:
+        RuntimeError: If ``gtts-cli`` or ``ffmpeg`` is missing, or no deck is
+            configured for ``lang``.
+    """
+    import logging  # local import keeps this block free of file-level changes
+
+    require_command("gtts-cli")
+    require_command("ffmpeg")
+
+    language = config.language(lang)
+    decks = list(language.get("decks", []))
+    if not decks:
+        raise RuntimeError(f"No deck configured for language: {lang}")
+    deck = decks[0]
+
+    source = os.path.expanduser(sentence_file) if sentence_file else config.sentence_file_for(lang)
+    sentences = read_sentences(source)
+    tags = parse_tags(tags_value)
+    front_field = config.fields.get("front", "Front")
+    back_field = config.fields.get("back", "Back")
+    log = logging.getLogger(__name__)
+    added = 0
+    failed = 0
+
+    with tempfile.TemporaryDirectory() as temp_dir:
+        for index, sentence in enumerate(sentences):
+            # Timestamp + pid + running index keeps media file names unique.
+            basename = f"tts_{time.strftime('%Y%m%d_%H%M%S')}_{lang}_{os.getpid()}_{index}"
+            raw_output = os.path.join(temp_dir, f"{basename}_original.mp3")
+            output_path = os.path.join(temp_dir, f"{basename}.mp3")
+            try:
+                generate_tts(sentence, raw_output, str(language["tts_code"]), str(language["tts_tld"]))
+                speed_audio(raw_output, output_path, float(language["tts_tempo"]))
+                request(
+                    "addNote",
+                    url=config.anki_connect_url,
+                    note={
+                        "deckName": deck,
+                        "modelName": config.note_model,
+                        "fields": {front_field: "", back_field: sentence},
+                        "options": {"allowDuplicate": False},
+                        "tags": tags,
+                        "audio": [{"path": output_path, "filename": f"{basename}.mp3", "fields": [front_field]}],
+                    },
+                )
+            except Exception as exc:
+                # Best effort: one bad sentence must not abort the batch, but
+                # the reason is reported instead of being dropped silently.
+                failed += 1
+                log.warning("Failed to import sentence %r: %s", sentence, exc)
+            else:
+                added += 1
+    return ImportResult(processed=len(sentences), added=added, failed=failed)
@@ -0,0 +1,29 @@
+"""Text cleanup helpers shared by tools."""
+
+from __future__ import annotations
+
+from html import unescape
+
+import regex as re
+
+
+def extract_first_visible_line(text: str) -> str:
+    """Return the first line of ``text`` once HTML markup is removed.
+
+    Entities are decoded, ``br``/``div``/``p`` tags become line breaks and
+    every other tag is dropped. Returns ``""`` when nothing visible remains.
+    """
+    decoded = unescape(text or "")
+    with_breaks = re.sub(r"</?(br|div|p)[^>]*>", "\n", decoded, flags=re.IGNORECASE)
+    visible = re.sub(r"<[^>]+>", "", with_breaks).strip()
+    if not visible:
+        return ""
+    return visible.splitlines()[0]
+
+
+def extract_visible_text(text: str) -> str:
+    """Return all visible text of an HTML fragment with tidy whitespace.
+
+    Entities are decoded, ``br``/``div``/``p`` tags become line breaks, other
+    tags are dropped, runs of spaces/tabs collapse to one space and runs of
+    newlines collapse to one newline.
+    """
+    cleaned = unescape(text or "")
+    substitutions = (
+        (r"</?(br|div|p)[^>]*>", "\n", re.IGNORECASE),
+        (r"<[^>]+>", "", 0),
+        (r"[ \t]+", " ", 0),
+        (r"\n{2,}", "\n", 0),
+    )
+    for pattern, replacement, flags in substitutions:
+        cleaned = re.sub(pattern, replacement, cleaned, flags=flags)
+    return cleaned.strip()
+
+
+def normalize_word_key(value: str) -> str:
+    """Return ``value`` trimmed, lower-cased and with whitespace runs collapsed."""
+    lowered = value.strip().lower()
+    return re.sub(r"\s+", " ", lowered)
+
@@ -0,0 +1,183 @@
+"""Extract and compare language-learning vocabulary."""
+
+from __future__ import annotations
+
+import logging
+import os
+from collections import Counter
+from typing import Callable
+
+import regex as re
+
+from .ankiconnect import anki_request
+from .config import Config
+from .text import extract_first_visible_line, extract_visible_text, normalize_word_key
+
+# One or more Hiragana, Katakana or Han characters, or the long-vowel mark.
+JAPANESE_CHAR_RE = re.compile(r"[\p{Script=Hiragana}\p{Script=Katakana}\p{Script=Han}ー]+")
+# Surface forms that japanese_filter rejects outright.
+JAPANESE_PARTICLES = {
+    "は", "が", "を", "に", "へ", "で", "と", "や", "も", "から", "まで", "より", "ば", "なら",
+    "の", "ね", "よ", "ぞ", "ぜ", "さ", "わ", "か", "な", "って", "とき", "ってば", "けど", "けれど",
+    "しかし", "でも", "ながら", "ほど", "し", "もの", "こと", "ところ", "よう", "らしい", "られる",
+}
+# Lemmas that japanese_filter rejects outright.
+JAPANESE_GRAMMAR_EXCLUDE = {
+    "て", "た", "ます", "れる", "てる", "ぬ", "ん", "しまう", "いる", "ない", "なる", "ある", "だ", "です",
+}
+# Part-of-speech tags a Japanese token must carry to be counted.
+JAPANESE_ALLOWED_POS = {"NOUN", "PROPN", "VERB", "ADJ"}
+
+
+def setup_logging(logfile: str) -> None:
+    """Create the log file's directory and point basic logging at the file."""
+    log_dir = os.path.dirname(os.path.abspath(logfile))
+    os.makedirs(log_dir, exist_ok=True)
+    logging.basicConfig(
+        filename=logfile,
+        level=logging.INFO,
+        format="%(asctime)s [%(levelname)s] %(message)s",
+    )
+
+
+def build_query_from_decks(decks: list[str]) -> str:
+    """Join one quoted ``deck:`` clause per deck name with ``OR``."""
+    clauses = [f'deck:"{deck}"' for deck in decks]
+    return " OR ".join(clauses)
+
+
+def japanese_filter(token) -> bool:
+    """Return True when a Japanese token should be counted as vocabulary.
+
+    The surface must consist only of Japanese script characters, must not be
+    a listed particle or grammar lemma, must carry an allowed part of speech
+    and must not be flagged as a stop word, URL or e-mail address.
+    """
+    surface = (token.text or "").strip()
+    base_form = (token.lemma_ or "").strip()
+    if not surface:
+        return False
+    if not JAPANESE_CHAR_RE.fullmatch(surface):
+        return False
+    if base_form in JAPANESE_GRAMMAR_EXCLUDE:
+        return False
+    if surface in JAPANESE_PARTICLES:
+        return False
+    if getattr(token, "pos_", None) not in JAPANESE_ALLOWED_POS:
+        return False
+    for flag in ("is_stop", "like_url", "like_email"):
+        if getattr(token, flag, False):
+            return False
+    for symbol in "<>=/\\:&%":
+        if symbol in surface:
+            return False
+    markup_names = {"ruby", "rt", "div", "br", "nbsp", "href", "strong", "a"}
+    return surface not in markup_names
+
+
+def spanish_filter(token) -> bool:
+    """Return True for alphabetic tokens that are not stop words."""
+    if not getattr(token, "is_alpha", False):
+        return False
+    return not getattr(token, "is_stop", False)
+
+
+def spanish_format(token) -> str:
+    """Return the token's lemma (or its text) lower-cased and trimmed."""
+    chosen = token.lemma_ or token.text or ""
+    return chosen.lower().strip()
+
+
+def japanese_format(token) -> str:
+    """Return ``lemma (surface)`` when both exist and differ, else whichever exists."""
+    lemma = (token.lemma_ or "").strip()
+    surface = (token.text or "").strip()
+    if not lemma:
+        return surface
+    if not surface or lemma == surface:
+        return lemma
+    return f"{lemma} ({surface})"
+
+
+# Per-language token handling, keyed by the language's configured "name":
+# "token_filter" decides whether a token is counted and "output_format"
+# turns a counted token into the string that is tallied.
+LANGUAGE_PROFILES = {
+    "spanish": {"token_filter": spanish_filter, "output_format": spanish_format},
+    "japanese": {"token_filter": japanese_filter, "output_format": japanese_format},
+}
+
+
+def load_spacy_model(model_name: str):
+    """Import spaCy lazily and load ``model_name``.
+
+    Raises:
+        RuntimeError: If spaCy cannot be imported or the model cannot be
+            loaded; the original exception is chained as the cause.
+    """
+    try:
+        import spacy  # type: ignore
+    except Exception as import_error:
+        raise RuntimeError("Failed to import spaCy. Use a Python version supported by spaCy.") from import_error
+
+    try:
+        pipeline = spacy.load(model_name)
+    except Exception as load_error:
+        raise RuntimeError(f"Failed to load spaCy model '{model_name}'. Try: python -m spacy download {model_name}") from load_error
+    return pipeline
+
+
+def get_notes(query: str, config: Config, request: Callable = anki_request) -> list[dict]:
+    """Return the note info for every note matching ``query``.
+
+    The second AnkiConnect call is skipped when the search finds nothing.
+    """
+    endpoint = config.anki_connect_url
+    matching_ids = request("findNotes", url=endpoint, query=query) or []
+    if matching_ids:
+        return request("notesInfo", url=config.anki_connect_url, notes=matching_ids) or []
+    return []
+
+
+def extract_counts(
+    notes: list[dict],
+    field_name: str,
+    nlp,
+    token_filter: Callable,
+    output_format: Callable,
+    use_full_field: bool,
+) -> Counter:
+    """Count the formatted tokens found in one field across ``notes``.
+
+    Each note's ``field_name`` value is reduced to visible text (the whole
+    field, or only its first line), run through ``nlp``, and every token
+    accepted by ``token_filter`` is tallied under ``output_format(token)``.
+    Empty keys are not counted.
+    """
+    to_text = extract_visible_text if use_full_field else extract_first_visible_line
+    tally: Counter = Counter()
+    for note in notes:
+        note_fields = note.get("fields", {}) or {}
+        field_entry = note_fields.get(field_name, {}) or {}
+        visible = to_text(field_entry.get("value", "") or "")
+        if not visible:
+            continue
+        for token in nlp(visible):
+            if not token_filter(token):
+                continue
+            label = output_format(token)
+            if label:
+                tally[label] += 1
+    return tally
+
+
+def write_counts(counter: Counter, out_path: str, min_freq: int) -> int:
+    """Write ``word freq`` lines for entries seen at least ``min_freq`` times.
+
+    Lines are ordered by descending frequency, then by word. The parent
+    directory is created if needed. Returns the number of lines written.
+    """
+    kept = sorted(
+        ((word, freq) for word, freq in counter.items() if freq >= min_freq),
+        key=lambda pair: (-pair[1], pair[0]),
+    )
+    parent = os.path.dirname(os.path.abspath(out_path))
+    os.makedirs(parent, exist_ok=True)
+    with open(out_path, "w", encoding="utf-8") as handle:
+        handle.writelines(f"{word} {freq}\n" for word, freq in kept)
+    return len(kept)
+
+
+def _word_from_line(line: str) -> str:
+    """Return the word part of a stripped ``word count`` line."""
+    head, sep, tail = line.rpartition(" ")
+    # Only drop the last token when it really is a frequency count, so a
+    # multi-word entry in a plain word list is not cut short.
+    if sep and tail.isdigit():
+        return head
+    return line
+
+
+def read_word_file(path: str) -> set[str]:
+    """Return the normalized words listed in ``path``.
+
+    Blank lines are skipped. A trailing numeric count, as produced by
+    ``write_counts``, is ignored; lines without a count are taken whole.
+    """
+    words: set[str] = set()
+    with open(os.path.expanduser(path), "r", encoding="utf-8") as fh:
+        for line in fh:
+            stripped = line.strip()
+            if not stripped:
+                continue
+            words.add(normalize_word_key(_word_from_line(stripped)))
+    return words
+
+
+def compare_word_files(source_path: str, known_path: str) -> list[str]:
+    """Return the source lines whose word is not listed in the known file.
+
+    Lines are returned stripped but otherwise unchanged (count included), in
+    source order. Both files are parsed with the same rules.
+    """
+    known = read_word_file(known_path)
+    new_words: list[str] = []
+    with open(os.path.expanduser(source_path), "r", encoding="utf-8") as fh:
+        for line in fh:
+            stripped = line.strip()
+            if not stripped:
+                continue
+            if normalize_word_key(_word_from_line(stripped)) not in known:
+                new_words.append(stripped)
+    return new_words
+
+
+def extract_words(
+    config: Config,
+    lang: str,
+    query: str | None = None,
+    decks: list[str] | None = None,
+    field: str | None = None,
+    min_freq: int = 2,
+    outdir: str | None = None,
+    out: str | None = None,
+    full_field: bool = False,
+    spacy_model: str | None = None,
+    request: Callable = anki_request,
+) -> dict[str, object]:
+    """Count the vocabulary in a language's notes and write a word list.
+
+    Args:
+        config: Loaded configuration.
+        lang: Language key.
+        query: Explicit Anki search; overrides ``decks``.
+        decks: Deck names to search; defaults to the configured decks.
+        field: Note field to read; defaults to the configured field.
+        min_freq: Minimum count for a word to be written.
+        outdir: Output directory; defaults to ``word_output_root/<language>``.
+        out: Output file; defaults to ``words_<lang>.txt`` in the output dir.
+        full_field: Use the whole field instead of its first visible line.
+        spacy_model: spaCy model name; defaults to the configured model.
+        request: Callable used to talk to AnkiConnect.
+
+    Returns:
+        A dict with ``query``, ``notes``, ``unique``, ``written`` and ``out``.
+
+    Raises:
+        RuntimeError: If there is neither a query nor any deck to search, or
+            the field is missing from the first note found.
+    """
+    language_bucket = config.language_name(lang)
+    profile = LANGUAGE_PROFILES[language_bucket]
+    search_query = query or build_query_from_decks(decks or config.decks_for(lang))
+    if not search_query:
+        # An empty search string would not restrict the search to any deck;
+        # fail the same way extract_audio does when no decks are configured.
+        raise RuntimeError(f"No decks configured for language: {lang}")
+    out_dir = os.path.expanduser(outdir) if outdir else os.path.join(config.word_output_root, language_bucket)
+    out_path = os.path.expanduser(out) if out else os.path.join(out_dir, f"words_{lang}.txt")
+    model_name = spacy_model or str(config.language(lang).get("word_model"))
+    nlp = load_spacy_model(model_name)
+    notes = get_notes(search_query, config, request=request)
+    field_name = field or config.field_for(lang)
+    if notes:
+        # Only the first note is inspected to validate the field name.
+        fields0 = (notes[0].get("fields", {}) or {})
+        if field_name not in fields0:
+            raise RuntimeError(f"Field '{field_name}' not found. Available fields: {list(fields0.keys())}")
+    counter = extract_counts(notes, field_name, nlp, profile["token_filter"], profile["output_format"], full_field)
+    written = write_counts(counter, out_path, min_freq)
+    return {"query": search_query, "notes": len(notes), "unique": len(counter), "written": written, "out": out_path}
+
@@ -0,0 +1,179 @@
+"""YouTube transcript mining and Anki-ready exports."""
+
+from __future__ import annotations
+
+import csv
+import os
+import re
+from collections import Counter
+from dataclasses import dataclass
+from urllib.parse import parse_qs, urlparse
+
+from youtube_transcript_api import YouTubeTranscriptApi
+
+from .config import Config
+from .text import normalize_word_key
+from .words import read_word_file
+
+# Small stop-word lists keyed by transcript language code; count_words and
+# sentence_vocab skip these tokens. Unknown codes get no filtering.
+STOPWORDS = {
+    "es": {
+        "de", "la", "que", "el", "en", "y", "a", "los", "del", "se", "las", "por",
+        "un", "para", "con", "no", "una", "su", "al", "lo", "como",
+    },
+    "en": {"the", "is", "and", "of", "to", "in", "it", "that", "on", "you", "this", "for", "with"},
+    "ja": {"の", "に", "は", "を", "た", "が", "で", "て", "です", "ます", "する", "ある", "いる"},
+}
+
+
+# One non-empty transcript snippet, as built by transcript_lines().
+@dataclass(frozen=True)
+class TranscriptLine:
+    # Start value of the snippet; printed with an "s" suffix by the CLI.
+    start: float
+    # Snippet text with newlines replaced by spaces and trimmed.
+    text: str
+
+
+def extract_video_id(url_or_id: str) -> str:
+    """Return the video id from a YouTube URL, or the input unchanged.
+
+    Handles ``youtu.be/<id>``, ``watch?v=<id>`` and the path forms
+    ``/shorts/<id>``, ``/embed/<id>``, ``/live/<id>`` and ``/v/<id>``.
+    Anything that is not recognized is returned as given, so bare ids pass
+    through untouched.
+    """
+    if "youtube" not in url_or_id and "youtu.be" not in url_or_id:
+        return url_or_id
+
+    parsed = urlparse(url_or_id)
+    segments = [part for part in parsed.path.split("/") if part]
+    if parsed.hostname == "youtu.be":
+        # The id is the first path segment; trailing segments are ignored.
+        return segments[0] if segments else ""
+    if parsed.hostname in ("www.youtube.com", "youtube.com", "m.youtube.com", "music.youtube.com"):
+        values = parse_qs(parsed.query).get("v", [])
+        if values:
+            return values[0]
+        if len(segments) >= 2 and segments[0] in ("shorts", "embed", "live", "v"):
+            return segments[1]
+    return url_or_id
+
+
+def video_url(video_or_id: str) -> str:
+    """Return the canonical watch URL for a video id or YouTube URL."""
+    return "https://www.youtube.com/watch?v=" + extract_video_id(video_or_id)
+
+
+def fetch_transcript(video_id: str, lang_code: str):
+    """Fetch a video's transcript in ``lang_code``.
+
+    Uses the instance method ``fetch`` when the installed
+    youtube-transcript-api provides it, otherwise ``get_transcript``.
+
+    Raises:
+        RuntimeError: If the library exposes neither entry point.
+    """
+    wanted = [lang_code]
+    api_class = YouTubeTranscriptApi
+    if hasattr(api_class, "fetch"):
+        return api_class().fetch(video_id, languages=wanted)
+    if hasattr(api_class, "get_transcript"):
+        return api_class.get_transcript(video_id, languages=wanted)
+    raise RuntimeError("Unsupported youtube-transcript-api version.")
+
+
+def snippet_text(entry) -> str:
+    """Return the ``text`` of a transcript entry given as a dict or an object."""
+    raw = entry.get("text", "") if isinstance(entry, dict) else getattr(entry, "text", "")
+    return raw or ""
+
+
+def snippet_start(entry) -> float:
+    """Return the ``start`` of a transcript entry as a float, 0.0 when absent."""
+    raw = entry.get("start", 0.0) if isinstance(entry, dict) else getattr(entry, "start", 0.0)
+    return float(raw or 0.0)
+
+
+def transcript_lines(entries) -> list[TranscriptLine]:
+    """Convert raw transcript entries into TranscriptLine records.
+
+    Newlines inside a snippet become spaces; snippets that are empty after
+    trimming are dropped.
+    """
+    result: list[TranscriptLine] = []
+    for entry in entries:
+        flattened = snippet_text(entry).replace("\n", " ").strip()
+        if not flattened:
+            continue
+        result.append(TranscriptLine(snippet_start(entry), flattened))
+    return result
+
+
+def tokenize_japanese(text: str) -> list[str]:
+    """Split Japanese ``text`` into surface forms using fugashi.
+
+    The tagger is created on first use and kept on the function object, so
+    callers that tokenize line by line do not rebuild it for every call.
+
+    Raises:
+        RuntimeError: If fugashi is not installed.
+    """
+    tagger = getattr(tokenize_japanese, "_tagger", None)
+    if tagger is None:
+        try:
+            from fugashi import Tagger
+        except ImportError as e:
+            raise RuntimeError('Japanese requires fugashi. Install: pip install "fugashi[unidic-lite]"') from e
+        tagger = Tagger()
+        tokenize_japanese._tagger = tagger
+    return [w.surface for w in tagger(text)]
+
+
+def tokenize_spanish(text: str, raw: bool = False) -> list[str]:
+    """Return the word tokens of ``text``, lower-cased unless ``raw`` is set."""
+    words = re.findall(r"\b[\wáéíóúñü]+\b", text)
+    if raw:
+        return words
+    return [word.lower() for word in words]
+
+
+def tokenize_text(text: str, lang_code: str, raw: bool = False) -> list[str]:
+    """Tokenize ``text`` with the Japanese tokenizer for ``ja``, else the regex one.
+
+    ``raw`` only affects the non-Japanese path.
+    """
+    if lang_code == "ja":
+        return tokenize_japanese(text)
+    return tokenize_spanish(text, raw=raw)
+
+
+def count_words(tokens: list[str], lang_code: str, remove_stopwords: bool = True) -> Counter:
+    """Count ``tokens``, optionally leaving out the language's stop words."""
+    if not remove_stopwords:
+        return Counter(tokens)
+    ignored = STOPWORDS.get(lang_code, set())
+    return Counter([token for token in tokens if token not in ignored])
+
+
+def sentence_vocab(sentence: str, lang_code: str, known_words: set[str] | None = None) -> list[str]:
+    """Return the distinct candidate vocabulary tokens of ``sentence``.
+
+    Tokens are compared by their normalized key. Repeats, stop words and,
+    when ``known_words`` is given, already known words are left out. The
+    first occurrence of each remaining token is kept, in sentence order.
+    """
+    ignored = STOPWORDS.get(lang_code, set())
+    emitted: set[str] = set()
+    vocab: list[str] = []
+    for token in tokenize_text(sentence, lang_code):
+        key = normalize_word_key(token)
+        is_candidate = key not in emitted and key not in ignored
+        if is_candidate and (known_words is None or key not in known_words):
+            emitted.add(key)
+            vocab.append(token)
+    return vocab
+
+
+def write_sentence_export(
+    lines: list[TranscriptLine],
+    out_path: str,
+    video: str,
+    lang_code: str,
+    delimiter: str = "\t",
+    known_words_path: str | None = None,
+    only_new: bool = False,
+) -> int:
+    """Write transcript lines to a delimited file and return the row count.
+
+    The file has the header ``sentence, timestamp, video_url, vocab_guess``.
+    With ``only_new``, lines whose vocabulary guess is empty are skipped.
+    The header row is not included in the returned count.
+    """
+    known = read_word_file(known_words_path) if known_words_path else None
+    os.makedirs(os.path.dirname(os.path.abspath(out_path)), exist_ok=True)
+    header = ["sentence", "timestamp", "video_url", "vocab_guess"]
+    rows_written = 0
+    with open(out_path, "w", encoding="utf-8", newline="") as handle:
+        writer = csv.writer(handle, delimiter=delimiter)
+        writer.writerow(header)
+        for entry in lines:
+            guesses = sentence_vocab(entry.text, lang_code, known)
+            if only_new and not guesses:
+                continue
+            timestamp = f"{entry.start:.2f}"
+            writer.writerow([entry.text, timestamp, video_url(video), ", ".join(guesses)])
+            rows_written += 1
+    return rows_written
+
+
+def run_youtube(
+    config: Config,
+    lang: str,
+    video: str,
+    mode: str = "vocab",
+    top: int | None = None,
+    no_stopwords: bool = False,
+    raw: bool = False,
+    out: str | None = None,
+    fmt: str = "tsv",
+    known_words: str | None = None,
+    only_new: bool = False,
+) -> dict[str, object]:
+    """Fetch a video's transcript and mine it for sentences or vocabulary.
+
+    Args:
+        config: Loaded configuration.
+        lang: Language key; its transcript code selects the transcript.
+        video: Video id or URL.
+        mode: ``"sentences"`` for a sentence export, anything else for counts.
+        top: Keep only the ``top`` most common words (all when falsy).
+        no_stopwords: Keep stop words in the counts.
+        raw: Passed to the tokenizer as ``raw``.
+        out: Output file; optional in both modes.
+        fmt: ``"csv"`` selects a comma delimiter, anything else a tab.
+        known_words: Word-list path, used only by the sentence export.
+        only_new: Used only by the sentence export.
+
+    Returns:
+        Sentences with ``out``: ``mode``, ``lines`` (a count), ``written``,
+        ``out``. Sentences without ``out``: ``mode`` and ``lines`` (the list
+        of TranscriptLine). Vocab mode: ``mode``, ``items``, ``out``.
+    """
+    lang_code = config.transcript_code(lang)
+    video_id = extract_video_id(video)
+    entries = fetch_transcript(video_id, lang_code)
+    lines = transcript_lines(entries)
+
+    if mode == "sentences":
+        if out:
+            delimiter = "," if fmt == "csv" else "\t"
+            written = write_sentence_export(lines, out, video_id, lang_code, delimiter, known_words, only_new)
+            return {"mode": mode, "lines": len(lines), "written": written, "out": out}
+        return {"mode": mode, "lines": lines}
+
+    # Vocab mode: the whole transcript is tokenized as one text.
+    text = " ".join(line.text for line in lines)
+    tokens = tokenize_text(text, lang_code, raw=raw)
+    counts = count_words(tokens, lang_code, remove_stopwords=not no_stopwords)
+    items = counts.most_common(top) if top else counts.most_common()
+    if out:
+        os.makedirs(os.path.dirname(os.path.abspath(out)), exist_ok=True)
+        with open(out, "w", encoding="utf-8") as fh:
+            for word, count in items:
+                fh.write(f"{word} {count}\n")
+    return {"mode": mode, "items": items, "out": out}
+
				`@@ -0,0 +1,2 @@`
				`"""Utilities for Anki-based language learning workflows."""`