Rename project to Saiki and unify CLI

This commit is contained in:
Pawel
2026-05-26 18:09:26 -04:00
parent 8ee1f8de25
commit f38030238c
19 changed files with 1274 additions and 1326 deletions

2
saiki/__init__.py Normal file
View File

@@ -0,0 +1,2 @@
"""Utilities for Anki-based language learning workflows."""

19
saiki/ankiconnect.py Normal file
View File

@@ -0,0 +1,19 @@
"""Small AnkiConnect client."""
from __future__ import annotations
import requests
def anki_request(action: str, url: str = "http://localhost:8765", **params):
    """Invoke one AnkiConnect action and return its ``result`` payload.

    Args:
        action: AnkiConnect action name.
        url: Endpoint the JSON request is POSTed to.
        **params: Forwarded verbatim as the action's ``params`` object.

    Returns:
        The ``result`` entry of the decoded JSON response.

    Raises:
        requests.HTTPError: If ``raise_for_status`` rejects the response.
        RuntimeError: If the response carries a non-null ``error`` entry.
    """
    payload = {"action": action, "version": 6, "params": params}
    response = requests.post(url, json=payload, timeout=30)
    response.raise_for_status()
    body = response.json()
    error = body.get("error")
    if error is not None:
        raise RuntimeError(f"AnkiConnect error for {action}: {error}")
    return body["result"]

126
saiki/audio.py Normal file
View File

@@ -0,0 +1,126 @@
"""Extract Anki audio media into playlists."""
from __future__ import annotations
import os
import re
import shutil
import subprocess
import tempfile
from typing import Callable
from .ankiconnect import anki_request
from .config import Config
AUDIO_EXTS = (".mp3", ".wav", ".ogg", ".m4a", ".flac")
def resolve_media_paths(media_dir: str, out_dir: str, media_name: str) -> tuple[str, str] | None:
    """Map a ``[sound:...]`` media name to its source and destination paths.

    Args:
        media_dir: Directory the media file is read from.
        out_dir: Directory the media file is copied into.
        media_name: File name (possibly with sub-directories) taken from a note.

    Returns:
        ``(source_path, destination_path)``, or ``None`` when the name is
        empty, absolute, or would escape the two base directories.
    """
    normalized = os.path.normpath(media_name)
    # normpath turns "" and "a/.." into ".", which would name the base
    # directory itself rather than a file inside it.
    if normalized == os.curdir or os.path.isabs(normalized):
        return None
    # Only a leading ".." *component* escapes the base directory; a plain
    # prefix test would also reject valid names such as "..intro.mp3".
    if normalized == os.pardir or normalized.startswith(os.pardir + os.sep):
        return None
    return os.path.join(media_dir, normalized), os.path.join(out_dir, normalized)
def build_playlist(out_dir: str, language: str) -> str:
    """Write ``<language>.m3u`` listing every audio file found under *out_dir*.

    Entries are paths relative to *out_dir*, sorted, one per line. The
    playlist file itself and ``<language>_concat.mp3`` at the top level are
    left out.

    Returns:
        Path of the playlist that was written.
    """
    playlist_path = os.path.join(out_dir, f"{language}.m3u")
    skipped = {os.path.basename(playlist_path), f"{language}_concat.mp3"}

    entries: list[str] = []
    for folder, _subdirs, names in os.walk(out_dir):
        for name in names:
            full_path = os.path.join(folder, name)
            relative = os.path.relpath(full_path, out_dir)
            if relative in skipped:
                continue
            if not name.lower().endswith(AUDIO_EXTS):
                continue
            if os.path.isfile(full_path):
                entries.append(relative)

    entries.sort()
    with open(playlist_path, "w", encoding="utf-8") as playlist:
        playlist.writelines(f"{entry}\n" for entry in entries)
    return playlist_path
def concat_audio_from_m3u(out_dir: str, m3u_path: str, out_path: str) -> None:
    """Join the audio files named in a playlist into one MP3 via ffmpeg.

    Args:
        out_dir: Directory the playlist entries are relative to.
        m3u_path: Playlist with one relative path per line.
        out_path: File ffmpeg writes the concatenated audio to.

    Raises:
        RuntimeError: If ffmpeg is not on PATH or no listed audio file exists.
        subprocess.CalledProcessError: If ffmpeg exits with a failure status.
    """
    if shutil.which("ffmpeg") is None:
        raise RuntimeError("ffmpeg not found in PATH. Install ffmpeg to use --concat.")

    with open(m3u_path, "r", encoding="utf-8") as playlist:
        entries = [stripped for stripped in (raw.strip() for raw in playlist) if stripped]

    sources: list[str] = []
    for entry in entries:
        candidate = os.path.join(out_dir, entry)
        if os.path.isfile(candidate) and entry.lower().endswith(AUDIO_EXTS):
            sources.append(os.path.abspath(candidate))
    if not sources:
        raise RuntimeError("No audio files found to concatenate.")

    # The list file is closed before ffmpeg reads it, hence delete=False and
    # the manual removal below.
    with tempfile.NamedTemporaryFile("w", delete=False, encoding="utf-8") as listing:
        list_path = listing.name
        for source in sources:
            # Inside the single-quoted entry, ' is written as '\''.
            escaped = source.replace("'", "'\\''")
            listing.write(f"file '{escaped}'\n")

    command = [
        "ffmpeg", "-hide_banner", "-loglevel", "error", "-f", "concat", "-safe", "0",
        "-i", list_path, "-c:a", "libmp3lame", "-q:a", "4", "-y", out_path,
    ]
    try:
        subprocess.run(command, check=True)
    finally:
        try:
            os.remove(list_path)
        except OSError:
            pass
def extract_audio(
    config: Config,
    lang: str,
    outdir: str | None = None,
    media_dir: str | None = None,
    copy_only_new: bool = False,
    concat: bool = False,
    request: Callable = anki_request,
) -> dict[str, object]:
    """Copy the audio referenced by a language's decks and build a playlist.

    Args:
        config: Supplies the language name, decks, AnkiConnect URL and
            default directories.
        lang: Language key looked up in *config*.
        outdir: Destination directory; defaults to
            ``<audio_output_root>/<language name>``.
        media_dir: Source media directory; defaults to ``config.media_dir``.
        copy_only_new: Skip files that already exist at the destination.
        concat: Also write ``<language>_concat.mp3`` from the playlist.
        request: Callable used for AnkiConnect calls.

    Returns:
        Dict with ``copied`` (number of files copied), ``playlist``,
        ``outdir`` and ``concat`` (path, or ``None`` when not requested).

    Raises:
        RuntimeError: If no decks are configured for *lang*.
    """
    language = config.language_name(lang)
    selected_decks = config.decks_for(lang)
    if not selected_decks:
        raise RuntimeError(f"No decks configured for language: {lang}")

    # Expand "~" for an explicit media_dir the same way outdir is expanded.
    media_root = os.path.expanduser(media_dir) if media_dir else config.media_dir
    out_dir = os.path.expanduser(outdir) if outdir else os.path.join(config.audio_output_root, language)
    os.makedirs(out_dir, exist_ok=True)

    note_ids: list[int] = []
    for deck in selected_decks:
        note_ids.extend(request("findNotes", url=config.anki_connect_url, query=f'deck:"{deck}"') or [])
    # The same id may come back from more than one deck query; keep first-seen order.
    note_ids = list(dict.fromkeys(note_ids))
    if not note_ids:
        return {"copied": 0, "playlist": build_playlist(out_dir, language), "outdir": out_dir, "concat": None}

    notes = request("notesInfo", url=config.anki_connect_url, notes=note_ids) or []
    copied = 0
    handled: set[str] = set()
    for note in notes:
        for field in (note.get("fields", {}) or {}).values():
            value = field.get("value", "") or ""
            for media_name in re.findall(r"\[sound:(.+?)\]", value):
                # A file referenced by several notes is copied and counted once.
                if media_name in handled:
                    continue
                handled.add(media_name)
                paths = resolve_media_paths(media_root, out_dir, media_name)
                if paths is None:
                    continue
                src, dst = paths
                # isfile (not exists) so a directory never reaches copy2.
                if not os.path.isfile(src):
                    continue
                if copy_only_new and os.path.exists(dst):
                    continue
                os.makedirs(os.path.dirname(dst), exist_ok=True)
                shutil.copy2(src, dst)
                copied += 1

    m3u_path = build_playlist(out_dir, language)
    concat_path = None
    if concat:
        concat_path = os.path.join(out_dir, f"{language}_concat.mp3")
        concat_audio_from_m3u(out_dir, m3u_path, concat_path)
    return {"copied": copied, "playlist": m3u_path, "outdir": out_dir, "concat": concat_path}

126
saiki/cli.py Normal file
View File

@@ -0,0 +1,126 @@
"""Unified command-line interface for Saiki."""
from __future__ import annotations
import argparse
import sys
from .audio import extract_audio
from .config import Config, language_choices, load_config
from .importer import import_sentences
from .words import compare_word_files, extract_words
from .youtube import run_youtube
def add_config_arg(parser: argparse.ArgumentParser) -> None:
    """Register the shared ``--config`` option on *parser*."""
    parser.add_argument(
        "--config",
        help="Path to YAML config file.",
    )
def build_parser(config: Config | None = None) -> argparse.ArgumentParser:
    """Assemble the top-level parser together with every sub-command.

    Args:
        config: Configuration whose language keys become the accepted
            ``lang`` values. A falsy value triggers ``load_config()``.

    Returns:
        Parser with the ``audio``, ``words``, ``compare-words``, ``youtube``
        and ``import`` sub-commands registered.
    """
    lang_codes = language_choices(config or load_config())
    root = argparse.ArgumentParser(description="Saiki: sentence mining and listening tools for Anki.")
    add_config_arg(root)
    commands = root.add_subparsers(dest="command", required=True)

    # audio
    audio_cmd = commands.add_parser("audio", help="Extract Anki audio into playlists.")
    audio_cmd.add_argument("lang", choices=lang_codes)
    audio_cmd.add_argument("--concat", action="store_true")
    audio_cmd.add_argument("--outdir")
    audio_cmd.add_argument("--media-dir")
    audio_cmd.add_argument("--copy-only-new", action="store_true")

    # words: --query and --deck cannot be combined
    words_cmd = commands.add_parser("words", help="Extract frequent words from Anki.")
    words_cmd.add_argument("lang", choices=lang_codes)
    source_group = words_cmd.add_mutually_exclusive_group()
    source_group.add_argument("--query")
    source_group.add_argument("--deck", action="append")
    words_cmd.add_argument("--field")
    words_cmd.add_argument("--min-freq", type=int, default=2)
    words_cmd.add_argument("--outdir")
    words_cmd.add_argument("--out")
    words_cmd.add_argument("--full-field", action="store_true")
    words_cmd.add_argument("--spacy-model")

    # compare-words
    compare_cmd = commands.add_parser("compare-words", help="Print words in source that are not in known.")
    compare_cmd.add_argument("source")
    compare_cmd.add_argument("known")

    # youtube
    youtube_cmd = commands.add_parser("youtube", help="Mine a YouTube transcript.")
    youtube_cmd.add_argument("lang", choices=lang_codes)
    youtube_cmd.add_argument("video")
    youtube_cmd.add_argument("--mode", choices=["vocab", "sentences"], default="vocab")
    youtube_cmd.add_argument("--top", type=int)
    youtube_cmd.add_argument("--no-stopwords", action="store_true")
    youtube_cmd.add_argument("--raw", action="store_true")
    youtube_cmd.add_argument("--out")
    youtube_cmd.add_argument("--format", choices=["tsv", "csv"], default="tsv")
    youtube_cmd.add_argument("--known-words", help="Word list to filter vocab_guess against.")
    youtube_cmd.add_argument("--only-new", action="store_true", help="Only export sentences with unknown vocab.")

    # import
    import_cmd = commands.add_parser("import", help="Generate TTS and import sentence cards.")
    import_cmd.add_argument("lang", choices=lang_codes)
    import_cmd.add_argument("sentence_file", nargs="?")
    import_cmd.add_argument("--tags", help="Comma-separated tags. text-to-speech is always included.")

    return root
def main(argv: list[str] | None = None) -> int:
    """Parse *argv*, run the selected sub-command and report its outcome.

    ``--config`` is read by a throw-away pre-parser first, because the
    language choices of the real parser come from the loaded configuration.

    Returns:
        ``0`` on success, ``1`` when ``import`` had failed cards, ``2`` if no
        known command was matched.
    """
    bootstrap = argparse.ArgumentParser(add_help=False)
    add_config_arg(bootstrap)
    early, _unparsed = bootstrap.parse_known_args(argv)
    config = load_config(early.config)

    parser = build_parser(config)
    args = parser.parse_args(argv)
    command = args.command

    if command == "audio":
        result = extract_audio(
            config,
            args.lang,
            outdir=args.outdir,
            media_dir=args.media_dir,
            copy_only_new=args.copy_only_new,
            concat=args.concat,
        )
        print(f"Copied {result['copied']} files")
        print(f"Playlist: {result['playlist']}")
        print(f"Output directory: {result['outdir']}")
        if result["concat"]:
            print(f"Concatenated file: {result['concat']}")
        return 0

    if command == "words":
        result = extract_words(
            config,
            args.lang,
            query=args.query,
            decks=args.deck,
            field=args.field,
            min_freq=args.min_freq,
            outdir=args.outdir,
            out=args.out,
            full_field=args.full_field,
            spacy_model=args.spacy_model,
        )
        print(f"Query: {result['query']}")
        print(f"Found {result['notes']} notes")
        print(f"Extracted {result['unique']} unique entries")
        print(f"Wrote {result['written']} entries to: {result['out']}")
        return 0

    if command == "compare-words":
        for entry in compare_word_files(args.source, args.known):
            print(entry)
        return 0

    if command == "youtube":
        result = run_youtube(
            config,
            args.lang,
            args.video,
            mode=args.mode,
            top=args.top,
            no_stopwords=args.no_stopwords,
            raw=args.raw,
            out=args.out,
            fmt=args.format,
            known_words=args.known_words,
            only_new=args.only_new,
        )
        if args.mode != "sentences":
            for word, count in result["items"]:
                print(f"{word}: {count}")
        elif args.out:
            print(f"Wrote {result['written']} rows to: {result['out']}")
        else:
            for line in result["lines"]:
                print(f"[{line.start:.2f}s] {line.text}")
        return 0

    if command == "import":
        result = import_sentences(config, args.lang, sentence_file=args.sentence_file, tags_value=args.tags)
        print(f"Done. Added {result.added}/{result.processed} cards. Failed: {result.failed}")
        return 1 if result.failed else 0

    parser.print_help()
    return 2
# Script entry point: main()'s integer return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main(sys.argv[1:]))

148
saiki/config.py Normal file
View File

@@ -0,0 +1,148 @@
"""Configuration loading for Saiki.
Defaults mirror the original scripts. Users can override them with YAML at
~/.config/saiki/config.yaml or by passing --config to the CLI.
"""
from __future__ import annotations
import copy
import os
from dataclasses import dataclass
from typing import Any
try:
import yaml
except Exception: # pragma: no cover - handled when config files are loaded
yaml = None
# Built-in settings. load_config() deep-merges a user's YAML mapping over a
# copy of this dict, so a config file only needs the keys it wants to change.
DEFAULT_CONFIG: dict[str, Any] = {
    "anki_connect_url": "http://localhost:8765",
    # Path-like values may contain "~" and environment variables; the Config
    # properties expand them on access.
    # NOTE(review): this default looks like a Flatpak Anki profile path — confirm
    # it suits other install types.
    "media_dir": "~/.var/app/net.ankiweb.Anki/data/Anki2/User 1/collection.media",
    "audio_output_root": "~/Languages/Anki/anki-audio",
    "word_output_root": "~/Languages/Anki/anki-words",
    "sentence_dir": "~/Languages/Anki",
    # Note type and field names used when the importer adds notes.
    "note_model": "Basic",
    "fields": {"front": "Front", "back": "Back"},
    # Keyed by the language code accepted on the command line. Per entry:
    # "name" selects the output sub-folder and the word-extraction profile,
    # "transcript_code" is used for YouTube transcripts, the "tts_*" values
    # drive TTS generation, "decks" lists the Anki decks, "word_model" is the
    # spaCy model, "field" the note field read for words, and "sentence_file"
    # the default import file (relative names resolve under "sentence_dir").
    "languages": {
        "jp": {
            "name": "japanese",
            "transcript_code": "ja",
            "tts_code": "ja",
            "tts_tld": "com",
            "tts_tempo": 1.35,
            "decks": ["日本語"],
            "word_model": "ja_core_news_lg",
            "field": "Back",
            "sentence_file": "sentences_jp.txt",
        },
        "es": {
            "name": "spanish",
            "transcript_code": "es",
            "tts_code": "es",
            "tts_tld": "es",
            "tts_tempo": 1.25,
            "decks": ["Español"],
            "word_model": "es_core_news_sm",
            "field": "Back",
            "sentence_file": "sentences_es.txt",
        },
    },
}
@dataclass(frozen=True)
class Config:
    """Read-only accessor layer over the merged configuration mapping.

    Path-valued properties return their value with ``~`` and environment
    variables expanded.
    """

    # The merged configuration mapping (defaults plus any YAML overrides).
    data: dict[str, Any]

    @property
    def anki_connect_url(self) -> str:
        """URL AnkiConnect requests are sent to."""
        return str(self.data["anki_connect_url"])

    @property
    def media_dir(self) -> str:
        """Expanded path of the Anki media directory."""
        return expand_path(str(self.data["media_dir"]))

    @property
    def audio_output_root(self) -> str:
        """Expanded root directory for extracted audio."""
        return expand_path(str(self.data["audio_output_root"]))

    @property
    def word_output_root(self) -> str:
        """Expanded root directory for word lists."""
        return expand_path(str(self.data["word_output_root"]))

    @property
    def sentence_dir(self) -> str:
        """Expanded directory that relative sentence files resolve against."""
        return expand_path(str(self.data["sentence_dir"]))

    @property
    def note_model(self) -> str:
        """Note type name, ``"Basic"`` when unset."""
        return str(self.data.get("note_model", "Basic"))

    @property
    def fields(self) -> dict[str, str]:
        """Shallow copy of the ``fields`` mapping (empty when unset)."""
        return dict(self.data.get("fields", {}))

    @property
    def languages(self) -> dict[str, dict[str, Any]]:
        """Shallow copy of the ``languages`` mapping (empty when unset)."""
        return dict(self.data.get("languages", {}))

    def language(self, lang: str) -> dict[str, Any]:
        """Return a copy of the settings for *lang*.

        Raises:
            ValueError: If *lang* is not a configured language key.
        """
        try:
            return dict(self.languages[lang])
        except KeyError as e:
            available = ", ".join(sorted(self.languages))
            raise ValueError(f"Unsupported language '{lang}'. Available: {available}") from e

    def language_name(self, lang: str) -> str:
        """Return the ``name`` entry of *lang*."""
        return str(self.language(lang)["name"])

    def transcript_code(self, lang: str) -> str:
        """Return the ``transcript_code`` entry of *lang*."""
        return str(self.language(lang)["transcript_code"])

    def decks_for(self, lang: str) -> list[str]:
        """Return the decks configured for *lang* (empty list when unset)."""
        return list(self.language(lang).get("decks", []))

    def field_for(self, lang: str) -> str:
        """Return the note field for *lang*, falling back to the ``back`` field."""
        return str(self.language(lang).get("field", self.fields.get("back", "Back")))

    def sentence_file_for(self, lang: str) -> str:
        """Return the sentence file path for *lang*.

        The configured value (default ``sentences_<lang>.txt``) is expanded
        first; only a value that is still relative afterwards is resolved
        against ``sentence_dir``.
        """
        value = str(self.language(lang).get("sentence_file", f"sentences_{lang}.txt"))
        # Expand before deciding: "$HOME/x.txt" is relative as written but
        # absolute once expanded, and must not be joined onto sentence_dir.
        expanded = expand_path(value)
        if os.path.isabs(expanded) or expanded.startswith("~"):
            return expanded
        return expand_path(os.path.join(self.sentence_dir, expanded))
def expand_path(path: str) -> str:
    """Return *path* with environment variables, then ``~``, expanded."""
    with_variables = os.path.expandvars(path)
    return os.path.expanduser(with_variables)
def default_config_path() -> str:
    """Return the expanded location of the per-user config file."""
    location = "~/.config/saiki/config.yaml"
    return expand_path(location)
def deep_merge(base: dict[str, Any], override: dict[str, Any]) -> dict[str, Any]:
    """Return a deep copy of *base* with *override* layered on top.

    Where both sides hold a dict under the same key the two are merged
    recursively; in every other case the override value replaces the base
    value. *base* is not modified.
    """
    merged = copy.deepcopy(base)
    for key in override:
        incoming = override[key]
        existing = merged.get(key)
        if isinstance(incoming, dict) and isinstance(existing, dict):
            merged[key] = deep_merge(existing, incoming)
            continue
        merged[key] = incoming
    return merged
def load_config(path: str | None = None) -> Config:
    """Build a ``Config`` from the defaults plus an optional YAML file.

    Args:
        path: Explicit config file. When omitted, the default per-user
            location is used if it exists.

    Returns:
        Config holding the defaults deep-merged with the file's mapping.

    Raises:
        RuntimeError: If an explicitly given *path* does not exist, if PyYAML
            is unavailable while a file needs loading, or if the file's top
            level is not a mapping.
    """
    config = copy.deepcopy(DEFAULT_CONFIG)
    config_path = expand_path(path) if path else default_config_path()
    if not os.path.exists(config_path):
        # A missing default file just means "use the defaults"; a missing file
        # the caller asked for is reported instead of being silently ignored.
        if path:
            raise RuntimeError(f"Config file not found: {config_path}")
        return Config(config)
    if yaml is None:
        raise RuntimeError("Loading config files requires PyYAML. Install pyyaml.")
    with open(config_path, "r", encoding="utf-8") as fh:
        loaded = yaml.safe_load(fh) or {}
    if not isinstance(loaded, dict):
        raise RuntimeError(f"Config must be a YAML mapping: {config_path}")
    return Config(deep_merge(config, loaded))
def language_choices(config: Config) -> list[str]:
    """Return the configured language keys in sorted order."""
    configured = config.languages
    return sorted(configured)

112
saiki/importer.py Normal file
View File

@@ -0,0 +1,112 @@
"""Generate TTS audio and add sentence notes to Anki."""
from __future__ import annotations
import os
import csv
import shutil
import subprocess
import tempfile
import time
from dataclasses import dataclass
from typing import Callable
from .ankiconnect import anki_request
from .config import Config
@dataclass(frozen=True)
class ImportResult:
    """Immutable tally of one sentence-import run."""

    # Number of sentences read from the input.
    processed: int
    # Sentences whose note was added.
    added: int
    # Sentences whose TTS generation or note creation raised.
    failed: int
def parse_tags(value: str | None) -> list[str]:
    """Turn a comma-separated tag string into the tag list for new notes.

    ``text-to-speech`` always comes first. An empty or missing *value* adds
    ``AI-generated``; otherwise the non-blank, stripped pieces of *value*
    follow in order.
    """
    if not value:
        return ["text-to-speech", "AI-generated"]
    pieces = [piece.strip() for piece in value.split(",")]
    return ["text-to-speech", *(piece for piece in pieces if piece)]
def require_command(name: str) -> None:
    """Raise ``RuntimeError`` unless the executable *name* is found on PATH."""
    if shutil.which(name) is not None:
        return
    raise RuntimeError(f"Required command not found: {name}")
def generate_tts(sentence: str, raw_output: str, lang_code: str, tld: str) -> None:
    """Run ``gtts-cli`` to synthesise *sentence* into the file *raw_output*.

    Raises:
        subprocess.CalledProcessError: If the command exits with a failure status.
    """
    command = [
        "gtts-cli", sentence,
        "--lang", lang_code,
        "--tld", tld,
        "--output", raw_output,
    ]
    subprocess.run(command, check=True)
def speed_audio(raw_output: str, output_path: str, tempo: float) -> None:
    """Re-encode *raw_output* to *output_path* through ffmpeg's ``atempo`` filter.

    stdin is detached so ffmpeg cannot read from the caller's terminal.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a failure status.
    """
    command = [
        "ffmpeg", "-loglevel", "error",
        "-i", raw_output,
        "-filter:a", f"atempo={tempo}",
        "-y", output_path,
    ]
    subprocess.run(command, stdin=subprocess.DEVNULL, check=True)
def read_sentences(path: str) -> list[str]:
    """Read the sentences to import from a text, TSV or CSV file.

    ``.tsv``/``.csv`` files must have a ``sentence`` header column; other
    files are read as one sentence per line. Blank entries are dropped and
    surrounding whitespace is stripped.

    Args:
        path: File to read; a leading ``~`` is expanded.

    Returns:
        The non-empty sentences in file order.

    Raises:
        RuntimeError: If a TSV/CSV file lacks the ``sentence`` header.
    """
    expanded = os.path.expanduser(path)
    lowered = expanded.lower()
    # "utf-8-sig" decodes plain UTF-8 as well and drops a leading BOM, which
    # would otherwise hide the "sentence" header or stick to the first line.
    if lowered.endswith((".tsv", ".csv")):
        delimiter = "\t" if lowered.endswith(".tsv") else ","
        with open(expanded, "r", encoding="utf-8-sig", newline="") as fh:
            reader = csv.DictReader(fh, delimiter=delimiter)
            if not reader.fieldnames or "sentence" not in reader.fieldnames:
                raise RuntimeError("TSV/CSV sentence imports must include a 'sentence' header.")
            # A row shorter than the header yields None for the missing cells.
            cells = ((row.get("sentence") or "").strip() for row in reader)
            return [cell for cell in cells if cell]
    with open(expanded, "r", encoding="utf-8-sig") as fh:
        return [line.strip() for line in fh if line.strip()]
def import_sentences(
    config: Config,
    lang: str,
    sentence_file: str | None = None,
    tags_value: str | None = None,
    request: Callable = anki_request,
) -> ImportResult:
    """Generate TTS audio for each sentence and add it to Anki as a note.

    Each note goes to the first deck configured for *lang*; the sentence is
    placed in the back field and the generated audio is attached to the front
    field. A sentence that fails is logged and counted, and the run continues.

    Args:
        config: Supplies language settings, note model, field names and the
            AnkiConnect URL.
        lang: Language key looked up in *config*.
        sentence_file: File to import; defaults to the language's configured
            sentence file.
        tags_value: Comma-separated tags (see ``parse_tags``).
        request: Callable used for AnkiConnect calls.

    Returns:
        ImportResult with the processed, added and failed counts.

    Raises:
        RuntimeError: If ``gtts-cli`` or ``ffmpeg`` is missing, or no deck is
            configured for *lang*.
    """
    import logging  # local import: this module's import block does not provide it

    log = logging.getLogger(__name__)

    require_command("gtts-cli")
    require_command("ffmpeg")
    language = config.language(lang)
    decks = list(language.get("decks", []))
    if not decks:
        raise RuntimeError(f"No deck configured for language: {lang}")
    deck = decks[0]
    source = os.path.expanduser(sentence_file) if sentence_file else config.sentence_file_for(lang)
    sentences = read_sentences(source)
    tags = parse_tags(tags_value)
    front_field = config.fields.get("front", "Front")
    back_field = config.fields.get("back", "Back")

    added = 0
    failed = 0
    with tempfile.TemporaryDirectory() as temp_dir:
        # The running index keeps file names unique within one second.
        for index, sentence in enumerate(sentences):
            basename = f"tts_{time.strftime('%Y%m%d_%H%M%S')}_{lang}_{os.getpid()}_{index}"
            raw_output = os.path.join(temp_dir, f"{basename}_original.mp3")
            output_path = os.path.join(temp_dir, f"{basename}.mp3")
            try:
                generate_tts(sentence, raw_output, str(language["tts_code"]), str(language["tts_tld"]))
                speed_audio(raw_output, output_path, float(language["tts_tempo"]))
                request(
                    "addNote",
                    url=config.anki_connect_url,
                    note={
                        "deckName": deck,
                        "modelName": config.note_model,
                        "fields": {front_field: "", back_field: sentence},
                        "options": {"allowDuplicate": False},
                        "tags": tags,
                        "audio": [{"path": output_path, "filename": f"{basename}.mp3", "fields": [front_field]}],
                    },
                )
            except Exception as exc:
                # Best effort by design: one bad sentence must not abort the
                # batch, but the reason is reported instead of being dropped.
                failed += 1
                log.warning("Failed to import sentence %r: %s", sentence, exc)
            else:
                added += 1
    return ImportResult(processed=len(sentences), added=added, failed=failed)

29
saiki/text.py Normal file
View File

@@ -0,0 +1,29 @@
"""Text cleanup helpers shared by tools."""
from __future__ import annotations
from html import unescape
import regex as re
def extract_first_visible_line(text: str) -> str:
    """Return the first non-empty line of the text a reader would see.

    ``<br>``, ``<div>`` and ``<p>`` tags become line breaks, all other tags
    are removed, and HTML entities are decoded afterwards.

    Args:
        text: HTML field value; ``None`` or ``""`` yields ``""``.

    Returns:
        The first visible line with surrounding whitespace removed.
    """
    text = re.sub(r"</?(br|div|p)[^>]*>", "\n", text or "", flags=re.IGNORECASE)
    text = re.sub(r"<[^>]+>", "", text)
    # Decode entities only after the markup is gone: decoding first turns
    # "&lt;" / "&gt;" into literal angle brackets that the tag pattern then
    # deletes together with the text between them.
    text = unescape(text).strip()
    return text.splitlines()[0].strip() if text else ""
def extract_visible_text(text: str) -> str:
    """Return all text a reader would see, with whitespace tidied.

    ``<br>``, ``<div>`` and ``<p>`` tags become line breaks, all other tags
    are removed, HTML entities are decoded, runs of spaces/tabs collapse to
    one space and consecutive newlines to one newline.

    Args:
        text: HTML field value; ``None`` or ``""`` yields ``""``.

    Returns:
        The cleaned text with surrounding whitespace removed.
    """
    text = re.sub(r"</?(br|div|p)[^>]*>", "\n", text or "", flags=re.IGNORECASE)
    text = re.sub(r"<[^>]+>", "", text)
    # Decode entities only after the markup is gone: decoding first turns
    # "&lt;" / "&gt;" into literal angle brackets that the tag pattern then
    # deletes together with the text between them.
    text = unescape(text)
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r"\n{2,}", "\n", text)
    return text.strip()
def normalize_word_key(value: str) -> str:
    """Return *value* stripped, lower-cased and with whitespace runs collapsed to one space."""
    lowered = value.strip().lower()
    return re.sub(r"\s+", " ", lowered)

183
saiki/words.py Normal file
View File

@@ -0,0 +1,183 @@
"""Extract and compare language-learning vocabulary."""
from __future__ import annotations
import logging
import os
from collections import Counter
from typing import Callable
import regex as re
from .ankiconnect import anki_request
from .config import Config
from .text import extract_first_visible_line, extract_visible_text, normalize_word_key
# One or more characters that are all hiragana, katakana, Han, or the "ー"
# mark; japanese_filter applies it with fullmatch to the token's surface text.
JAPANESE_CHAR_RE = re.compile(r"[\p{Script=Hiragana}\p{Script=Katakana}\p{Script=Han}ー]+")
# Surface forms that japanese_filter rejects outright.
# NOTE(review): many entries here are empty strings, which looks like
# characters lost somewhere in transit — presumably single-kana particles.
# Verify against the authoritative copy of this file.
JAPANESE_PARTICLES = {
    "", "", "", "", "", "", "", "", "", "から", "まで", "より", "", "なら",
    "", "", "", "", "", "", "", "", "", "って", "とき", "ってば", "けど", "けれど",
    "しかし", "でも", "ながら", "ほど", "", "もの", "こと", "ところ", "よう", "らしい", "られる",
}
# Lemmas that japanese_filter rejects. Because "" is a member, a token whose
# lemma is empty is rejected as well.
# NOTE(review): the empty strings look like lost characters here too — confirm.
JAPANESE_GRAMMAR_EXCLUDE = {
    "", "", "ます", "れる", "てる", "", "", "しまう", "いる", "ない", "なる", "ある", "", "です",
}
# Values of the token's pos_ attribute that japanese_filter accepts.
JAPANESE_ALLOWED_POS = {"NOUN", "PROPN", "VERB", "ADJ"}
def setup_logging(logfile: str) -> None:
    """Create the log file's directory and configure INFO-level logging to *logfile*."""
    log_dir = os.path.dirname(os.path.abspath(logfile))
    os.makedirs(log_dir, exist_ok=True)
    logging.basicConfig(
        filename=logfile,
        level=logging.INFO,
        format="%(asctime)s [%(levelname)s] %(message)s",
    )
def build_query_from_decks(decks: list[str]) -> str:
    """Return an Anki search string matching any of *decks* (``""`` for none)."""
    clauses = [f'deck:"{name}"' for name in decks]
    return " OR ".join(clauses)
def japanese_filter(token) -> bool:
    """Decide whether a Japanese token should be counted as vocabulary.

    A token passes only if its surface text is made up entirely of Japanese
    script characters, it is not a listed particle or grammar lemma, its part
    of speech is allowed, and it is not flagged as a stop word, URL or e-mail.
    """
    surface = (token.text or "").strip()
    base_form = (token.lemma_ or "").strip()
    if not surface:
        return False
    if not JAPANESE_CHAR_RE.fullmatch(surface):
        return False
    if base_form in JAPANESE_GRAMMAR_EXCLUDE:
        return False
    if surface in JAPANESE_PARTICLES:
        return False
    if getattr(token, "pos_", None) not in JAPANESE_ALLOWED_POS:
        return False
    for flag in ("is_stop", "like_url", "like_email"):
        if getattr(token, flag, False):
            return False
    # Leftover-markup guards: punctuation and tag names from HTML fields.
    for char in "<>=/\\:&%":
        if char in surface:
            return False
    markup_words = {"ruby", "rt", "div", "br", "nbsp", "href", "strong", "a"}
    return surface not in markup_words
def spanish_filter(token) -> bool:
    """Accept tokens that are alphabetic and not stop words."""
    if not getattr(token, "is_alpha", False):
        return False
    return not getattr(token, "is_stop", False)
def spanish_format(token) -> str:
    """Return the token's lemma (or its text when the lemma is empty), lower-cased and stripped."""
    chosen = token.lemma_ or token.text or ""
    return chosen.lower().strip()
def japanese_format(token) -> str:
    """Return ``lemma (surface)`` when the two differ, otherwise whichever one is present."""
    lemma = (token.lemma_ or "").strip()
    surface = (token.text or "").strip()
    if not lemma:
        return surface
    if not surface or lemma == surface:
        return lemma
    return f"{lemma} ({surface})"
# Per-language token handling, keyed by the language *name* from the config
# (not the CLI language code). "token_filter" decides whether a token is
# counted; "output_format" produces the key it is counted under.
LANGUAGE_PROFILES = {
    "spanish": {"token_filter": spanish_filter, "output_format": spanish_format},
    "japanese": {"token_filter": japanese_filter, "output_format": japanese_format},
}
def load_spacy_model(model_name: str):
    """Import spaCy on demand and load the pipeline *model_name*.

    Raises:
        RuntimeError: If spaCy cannot be imported or the model cannot be
            loaded; the original exception is chained as the cause.
    """
    try:
        import spacy  # type: ignore
    except Exception as import_error:
        raise RuntimeError("Failed to import spaCy. Use a Python version supported by spaCy.") from import_error

    try:
        pipeline = spacy.load(model_name)
    except Exception as load_error:
        message = f"Failed to load spaCy model '{model_name}'. Try: python -m spacy download {model_name}"
        raise RuntimeError(message) from load_error
    return pipeline
def get_notes(query: str, config: Config, request: Callable = anki_request) -> list[dict]:
    """Return the note records matching *query*, or ``[]`` when nothing matches.

    Note ids are looked up with ``findNotes`` and, if any are found, expanded
    with ``notesInfo``.
    """
    endpoint = config.anki_connect_url
    found = request("findNotes", url=endpoint, query=query) or []
    if found:
        return request("notesInfo", url=endpoint, notes=found) or []
    return []
def extract_counts(
    notes: list[dict],
    field_name: str,
    nlp,
    token_filter: Callable,
    output_format: Callable,
    use_full_field: bool,
) -> Counter:
    """Count formatted tokens across one field of every note.

    Args:
        notes: Note records carrying a ``fields`` mapping.
        field_name: Field whose ``value`` is analysed.
        nlp: Callable that turns text into an iterable of tokens.
        token_filter: Predicate selecting the tokens to count.
        output_format: Maps a token to its counter key; empty keys are skipped.
        use_full_field: Analyse all visible text instead of only the first line.

    Returns:
        Counter mapping each key to its number of occurrences.
    """
    tally: Counter = Counter()
    to_visible = extract_visible_text if use_full_field else extract_first_visible_line
    for note in notes:
        note_fields = note.get("fields", {}) or {}
        raw_value = (note_fields.get(field_name, {}) or {}).get("value", "") or ""
        visible = to_visible(raw_value)
        if not visible:
            continue
        for token in nlp(visible):
            if not token_filter(token):
                continue
            label = output_format(token)
            if label:
                tally[label] += 1
    return tally
def write_counts(counter: Counter, out_path: str, min_freq: int) -> int:
    """Write ``word count`` lines for entries seen at least *min_freq* times.

    Lines are ordered by descending count, then by word. The parent directory
    of *out_path* is created when missing.

    Returns:
        Number of entries written.
    """
    kept = sorted(
        ((word, freq) for word, freq in counter.items() if freq >= min_freq),
        key=lambda pair: (-pair[1], pair[0]),
    )
    os.makedirs(os.path.dirname(os.path.abspath(out_path)), exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as handle:
        handle.writelines(f"{word} {freq}\n" for word, freq in kept)
    return len(kept)
def _entry_without_count(line: str) -> str:
    """Return *line* minus a trailing ``<space><digits>`` count column, if it has one."""
    head, separator, tail = line.rpartition(" ")
    # Only a numeric last column is a count; cutting unconditionally would
    # drop the last word of a multi-word entry in a plain word list.
    if separator and tail.isdigit():
        return head
    return line


def read_word_file(path: str) -> set[str]:
    """Load a word list into a set of normalised keys.

    Each non-blank line is either ``word`` or ``word count``; the count, when
    present, is ignored.

    Args:
        path: File to read; a leading ``~`` is expanded.

    Returns:
        The normalised words found in the file.
    """
    words: set[str] = set()
    with open(os.path.expanduser(path), "r", encoding="utf-8") as fh:
        for line in fh:
            stripped = line.strip()
            if not stripped:
                continue
            words.add(normalize_word_key(_entry_without_count(stripped)))
    return words


def compare_word_files(source_path: str, known_path: str) -> list[str]:
    """Return the lines of *source_path* whose word is absent from *known_path*.

    Both files use the ``word`` / ``word count`` line format. Matching is done
    on normalised words; the returned lines are the stripped source lines, in
    file order.
    """
    known = read_word_file(known_path)
    new_words: list[str] = []
    with open(os.path.expanduser(source_path), "r", encoding="utf-8") as fh:
        for line in fh:
            stripped = line.strip()
            if not stripped:
                continue
            if normalize_word_key(_entry_without_count(stripped)) not in known:
                new_words.append(stripped)
    return new_words
def extract_words(
    config: Config,
    lang: str,
    query: str | None = None,
    decks: list[str] | None = None,
    field: str | None = None,
    min_freq: int = 2,
    outdir: str | None = None,
    out: str | None = None,
    full_field: bool = False,
    spacy_model: str | None = None,
    request: Callable = anki_request,
) -> dict[str, object]:
    """Count vocabulary in Anki notes and write it to a frequency file.

    Args:
        config: Supplies language settings, decks and output locations.
        lang: Language key looked up in *config*.
        query: Anki search string; when omitted one is built from the decks.
        decks: Decks to search; defaults to the language's configured decks.
        field: Note field to analyse; defaults to the language's field.
        min_freq: Minimum count for an entry to be written.
        outdir: Output directory; defaults to
            ``<word_output_root>/<language name>``.
        out: Output file; defaults to ``words_<lang>.txt`` in the output directory.
        full_field: Analyse all visible text rather than only the first line.
        spacy_model: spaCy model name; defaults to the language's ``word_model``.
        request: Callable used for AnkiConnect calls.

    Returns:
        Dict with ``query``, ``notes`` (count), ``unique`` (distinct entries),
        ``written`` (entries written) and ``out`` (file path).

    Raises:
        RuntimeError: If the language has no extraction profile, no decks or
            query are available, no spaCy model is configured, or the field
            is missing from the first note.
    """
    language_bucket = config.language_name(lang)
    profile = LANGUAGE_PROFILES.get(language_bucket)
    if profile is None:
        supported = ", ".join(sorted(LANGUAGE_PROFILES))
        raise RuntimeError(
            f"No word-extraction profile for language '{language_bucket}'. Supported: {supported}"
        )

    if query:
        search_query = query
    else:
        selected_decks = decks or config.decks_for(lang)
        # An empty deck list would otherwise produce an empty search string.
        if not selected_decks:
            raise RuntimeError(f"No decks configured for language: {lang}")
        search_query = build_query_from_decks(selected_decks)

    out_dir = os.path.expanduser(outdir) if outdir else os.path.join(config.word_output_root, language_bucket)
    out_path = os.path.expanduser(out) if out else os.path.join(out_dir, f"words_{lang}.txt")

    # Check before str(): a missing entry would otherwise become the model name "None".
    model_name = spacy_model or config.language(lang).get("word_model")
    if not model_name:
        raise RuntimeError(f"No spaCy model configured for language: {lang}")
    nlp = load_spacy_model(str(model_name))

    notes = get_notes(search_query, config, request=request)
    field_name = field or config.field_for(lang)
    if notes:
        # Only the first note is inspected to validate the field name.
        first_fields = notes[0].get("fields", {}) or {}
        if field_name not in first_fields:
            raise RuntimeError(f"Field '{field_name}' not found. Available fields: {list(first_fields.keys())}")

    counter = extract_counts(notes, field_name, nlp, profile["token_filter"], profile["output_format"], full_field)
    written = write_counts(counter, out_path, min_freq)
    return {"query": search_query, "notes": len(notes), "unique": len(counter), "written": written, "out": out_path}

179
saiki/youtube.py Normal file
View File

@@ -0,0 +1,179 @@
"""YouTube transcript mining and Anki-ready exports."""
from __future__ import annotations
import csv
import os
import re
from collections import Counter
from dataclasses import dataclass
from urllib.parse import parse_qs, urlparse
from youtube_transcript_api import YouTubeTranscriptApi
from .config import Config
from .text import normalize_word_key
from .words import read_word_file
# Words left out of vocabulary counts, keyed by transcript language code.
# Lookups compare tokens against these sets exactly as written.
# NOTE(review): the "ja" set contains several empty strings, which looks like
# characters lost somewhere in transit — presumably single-kana particles.
# Verify against the authoritative copy of this file.
STOPWORDS = {
    "es": {
        "de", "la", "que", "el", "en", "y", "a", "los", "del", "se", "las", "por",
        "un", "para", "con", "no", "una", "su", "al", "lo", "como",
    },
    "en": {"the", "is", "and", "of", "to", "in", "it", "that", "on", "you", "this", "for", "with"},
    "ja": {"", "", "", "", "", "", "", "", "です", "ます", "する", "ある", "いる"},
}
@dataclass(frozen=True)
class TranscriptLine:
    """One immutable transcript snippet."""

    # Start offset of the snippet (printed with an "s" suffix by the CLI).
    start: float
    # Snippet text as a single line.
    text: str
def extract_video_id(url_or_id: str) -> str:
    """Return the video id contained in a YouTube URL, or the input unchanged.

    Recognised forms: ``youtu.be/<id>``, ``watch?v=<id>`` and the
    ``/shorts/<id>``, ``/embed/<id>``, ``/live/<id>`` and ``/v/<id>`` paths on
    the YouTube hosts, with or without a scheme or ``www.`` prefix.

    Args:
        url_or_id: A YouTube URL or a bare video id.

    Returns:
        The extracted id; *url_or_id* itself when it does not look like a
        YouTube URL or no id can be found in it.
    """
    if "youtube" not in url_or_id and "youtu.be" not in url_or_id:
        return url_or_id

    candidate = url_or_id.strip()
    # Without a scheme urlparse reports no hostname at all.
    if "://" not in candidate:
        candidate = f"https://{candidate}"
    parsed = urlparse(candidate)
    host = parsed.hostname or ""
    if host.startswith("www."):
        host = host[len("www."):]
    segments = [segment for segment in parsed.path.split("/") if segment]

    if host == "youtu.be":
        return segments[0] if segments else url_or_id
    if host in ("youtube.com", "m.youtube.com", "music.youtube.com", "youtube-nocookie.com"):
        values = parse_qs(parsed.query).get("v", [])
        if values:
            return values[0]
        if len(segments) >= 2 and segments[0] in ("shorts", "embed", "live", "v"):
            return segments[1]
    return url_or_id
def video_url(video_or_id: str) -> str:
    """Return the ``watch?v=`` URL for a video given as a URL or an id."""
    return f"https://www.youtube.com/watch?v={extract_video_id(video_or_id)}"
def fetch_transcript(video_id: str, lang_code: str):
    """Fetch a video's transcript in *lang_code*.

    Uses the instance method ``fetch`` when the installed
    youtube-transcript-api has it, otherwise the class-level
    ``get_transcript``.

    Raises:
        RuntimeError: If the library exposes neither entry point.
    """
    wanted = [lang_code]
    if hasattr(YouTubeTranscriptApi, "fetch"):
        return YouTubeTranscriptApi().fetch(video_id, languages=wanted)
    if not hasattr(YouTubeTranscriptApi, "get_transcript"):
        raise RuntimeError("Unsupported youtube-transcript-api version.")
    return YouTubeTranscriptApi.get_transcript(video_id, languages=wanted)
def snippet_text(entry) -> str:
    """Return the ``text`` of a transcript entry given as a dict or an object (``""`` if absent)."""
    raw = entry.get("text", "") if isinstance(entry, dict) else getattr(entry, "text", "")
    return raw or ""
def snippet_start(entry) -> float:
    """Return the ``start`` of a transcript entry (dict or object) as a float, ``0.0`` if absent."""
    raw = entry.get("start", 0.0) if isinstance(entry, dict) else getattr(entry, "start", 0.0)
    return float(raw or 0.0)
def transcript_lines(entries) -> list[TranscriptLine]:
    """Convert raw transcript entries into ``TranscriptLine`` records.

    Newlines inside an entry become spaces; entries whose text is empty after
    stripping are dropped.
    """
    collected: list[TranscriptLine] = []
    for entry in entries:
        flattened = snippet_text(entry).replace("\n", " ").strip()
        if not flattened:
            continue
        collected.append(TranscriptLine(snippet_start(entry), flattened))
    return collected
def tokenize_japanese(text: str) -> list[str]:
    """Split Japanese *text* into surface forms with fugashi.

    The tagger is created on first use and reused afterwards, because this
    function is called once per sentence when exporting transcripts.

    Raises:
        RuntimeError: If fugashi is not installed.
    """
    tagger = getattr(tokenize_japanese, "_tagger", None)
    if tagger is None:
        try:
            from fugashi import Tagger
        except ImportError as e:
            raise RuntimeError('Japanese requires fugashi. Install: pip install "fugashi[unidic-lite]"') from e
        tagger = Tagger()
        # Cached on the function object so no module-level state is needed.
        tokenize_japanese._tagger = tagger
    return [w.surface for w in tagger(text)]
def tokenize_spanish(text: str, raw: bool = False) -> list[str]:
    """Return the word tokens of *text*, lower-cased unless *raw* is set."""
    words = re.findall(r"\b[\wáéíóúñü]+\b", text)
    if raw:
        return words
    return [word.lower() for word in words]
def tokenize_text(text: str, lang_code: str, raw: bool = False) -> list[str]:
    """Tokenise *text* with the Japanese tokenizer for ``"ja"`` and the word-regex tokenizer otherwise.

    *raw* only affects the non-Japanese path.
    """
    if lang_code == "ja":
        return tokenize_japanese(text)
    return tokenize_spanish(text, raw=raw)
def count_words(tokens: list[str], lang_code: str, remove_stopwords: bool = True) -> Counter:
    """Count *tokens*, optionally leaving out the stop words of *lang_code*."""
    if not remove_stopwords:
        return Counter(tokens)
    ignored = STOPWORDS.get(lang_code, set())
    return Counter(token for token in tokens if token not in ignored)
def sentence_vocab(sentence: str, lang_code: str, known_words: set[str] | None = None) -> list[str]:
    """Return the distinct, non-stop-word tokens of *sentence* in order of appearance.

    Tokens are compared by their normalised key. When *known_words* is given,
    tokens whose key is in it are left out too.
    """
    ignored = STOPWORDS.get(lang_code, set())
    picked: list[str] = []
    used_keys: set[str] = set()
    for token in tokenize_text(sentence, lang_code):
        key = normalize_word_key(token)
        if key in used_keys or key in ignored or (known_words is not None and key in known_words):
            continue
        used_keys.add(key)
        picked.append(token)
    return picked
def write_sentence_export(
    lines: list[TranscriptLine],
    out_path: str,
    video: str,
    lang_code: str,
    delimiter: str = "\t",
    known_words_path: str | None = None,
    only_new: bool = False,
) -> int:
    """Write transcript lines as delimited rows with a vocabulary guess.

    Columns are ``sentence``, ``timestamp``, ``video_url`` and ``vocab_guess``.

    Args:
        lines: Transcript lines to export.
        out_path: Destination file; its parent directory is created if needed.
        video: Video URL or id used for the ``video_url`` column.
        lang_code: Language code used for tokenising.
        delimiter: Column separator.
        known_words_path: Word list whose entries are left out of ``vocab_guess``.
        only_new: Skip lines whose ``vocab_guess`` would be empty.

    Returns:
        Number of data rows written (the header is not counted).
    """
    known = read_word_file(known_words_path) if known_words_path else None
    os.makedirs(os.path.dirname(os.path.abspath(out_path)), exist_ok=True)

    row_count = 0
    with open(out_path, "w", encoding="utf-8", newline="") as handle:
        writer = csv.writer(handle, delimiter=delimiter)
        writer.writerow(["sentence", "timestamp", "video_url", "vocab_guess"])
        for line in lines:
            vocab = sentence_vocab(line.text, lang_code, known)
            if vocab or not only_new:
                row = [line.text, f"{line.start:.2f}", video_url(video), ", ".join(vocab)]
                writer.writerow(row)
                row_count += 1
    return row_count
def run_youtube(
    config: Config,
    lang: str,
    video: str,
    mode: str = "vocab",
    top: int | None = None,
    no_stopwords: bool = False,
    raw: bool = False,
    out: str | None = None,
    fmt: str = "tsv",
    known_words: str | None = None,
    only_new: bool = False,
) -> dict[str, object]:
    """Fetch a video's transcript and mine it for sentences or vocabulary.

    Args:
        config: Supplies the transcript language code for *lang*.
        lang: Language key looked up in *config*.
        video: Video URL or id.
        mode: ``"sentences"`` for transcript lines, anything else for word counts.
        top: Limit word counts to the most common *top* entries (falsy = all).
        no_stopwords: Keep stop words in the word counts.
        raw: Keep original casing when tokenising (non-Japanese only).
        out: Output file; when omitted nothing is written.
        fmt: ``"csv"`` selects a comma delimiter, anything else a tab
            (sentence export only).
        known_words: Word list used to filter the sentence export's vocabulary.
        only_new: Sentence export skips lines without remaining vocabulary.

    Returns:
        Sentences mode without *out*: ``mode`` and ``lines`` (the line
        objects). Sentences mode with *out*: ``mode``, ``lines`` (a count),
        ``written`` and ``out``. Otherwise: ``mode``, ``items``
        (``(word, count)`` pairs) and ``out``.
    """
    lang_code = config.transcript_code(lang)
    video_id = extract_video_id(video)
    lines = transcript_lines(fetch_transcript(video_id, lang_code))

    if mode == "sentences":
        if not out:
            return {"mode": mode, "lines": lines}
        delimiter = "," if fmt == "csv" else "\t"
        written = write_sentence_export(lines, out, video_id, lang_code, delimiter, known_words, only_new)
        return {"mode": mode, "lines": len(lines), "written": written, "out": out}

    joined = " ".join(line.text for line in lines)
    tokens = tokenize_text(joined, lang_code, raw=raw)
    counts = count_words(tokens, lang_code, remove_stopwords=not no_stopwords)
    if top:
        items = counts.most_common(top)
    else:
        items = counts.most_common()

    if out:
        os.makedirs(os.path.dirname(os.path.abspath(out)), exist_ok=True)
        with open(out, "w", encoding="utf-8") as handle:
            handle.writelines(f"{word} {count}\n" for word, count in items)
    return {"mode": mode, "items": items, "out": out}