Rename project to Saiki and unify CLI
This commit is contained in:
2
saiki/__init__.py
Normal file
2
saiki/__init__.py
Normal file
@@ -0,0 +1,2 @@
|
||||
"""Utilities for Anki-based language learning workflows."""
|
||||
|
||||
19
saiki/ankiconnect.py
Normal file
19
saiki/ankiconnect.py
Normal file
@@ -0,0 +1,19 @@
|
||||
"""Small AnkiConnect client."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import requests
|
||||
|
||||
|
||||
def anki_request(action: str, url: str = "http://localhost:8765", **params):
    """Invoke one AnkiConnect action and return its ``result`` payload.

    Args:
        action: AnkiConnect action name.
        url: Endpoint the JSON request is POSTed to.
        **params: Sent as the request's ``params`` object.

    Returns:
        The ``result`` entry of the decoded JSON response.

    Raises:
        requests.HTTPError: If the HTTP status signals a failure.
        RuntimeError: If the response carries a non-null ``error`` entry.
    """
    payload = {"action": action, "version": 6, "params": params}
    response = requests.post(url, json=payload, timeout=30)
    response.raise_for_status()
    body = response.json()
    error = body.get("error")
    if error is None:
        return body["result"]
    raise RuntimeError(f"AnkiConnect error for {action}: {error}")
|
||||
|
||||
126
saiki/audio.py
Normal file
126
saiki/audio.py
Normal file
@@ -0,0 +1,126 @@
|
||||
"""Extract Anki audio media into playlists."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import subprocess
|
||||
import tempfile
|
||||
from typing import Callable
|
||||
|
||||
from .ankiconnect import anki_request
|
||||
from .config import Config
|
||||
|
||||
# Lower-case file suffixes treated as audio when building playlists and concat lists.
AUDIO_EXTS = (".mp3", ".wav", ".ogg", ".m4a", ".flac")
|
||||
|
||||
|
||||
def resolve_media_paths(media_dir: str, out_dir: str, media_name: str) -> tuple[str, str] | None:
    """Map a media file name to its source and destination paths.

    Args:
        media_dir: Directory the media file is read from.
        out_dir: Directory the media file will be copied into.
        media_name: File name (optionally with sub-directories) relative to
            ``media_dir``.

    Returns:
        ``(source_path, destination_path)``, or ``None`` when ``media_name``
        is empty, absolute, or would escape the two directories.
    """
    normalized = os.path.normpath(media_name)
    if os.path.isabs(normalized):
        return None
    # normpath("") is "."; that names the directory itself, not a media file.
    if normalized == os.curdir:
        return None
    # Only a real parent-directory component is an escape; a file called
    # "..intro.mp3" is a legitimate name and must not be rejected.
    if normalized == os.pardir or normalized.startswith(os.pardir + os.sep):
        return None
    return os.path.join(media_dir, normalized), os.path.join(out_dir, normalized)
|
||||
|
||||
|
||||
def build_playlist(out_dir: str, language: str) -> str:
    """Write ``<language>.m3u`` listing the audio files found under ``out_dir``.

    Entries are paths relative to ``out_dir``, one per line, sorted. The
    playlist file itself and ``<language>_concat.mp3`` are left out.

    Returns:
        Path of the playlist file that was written.
    """
    playlist_path = os.path.join(out_dir, f"{language}.m3u")
    excluded = {os.path.basename(playlist_path), f"{language}_concat.mp3"}

    entries: list[str] = []
    for folder, _dirs, names in os.walk(out_dir):
        for name in names:
            full = os.path.join(folder, name)
            relative = os.path.relpath(full, out_dir)
            if relative in excluded:
                continue
            if not name.lower().endswith(AUDIO_EXTS):
                continue
            if os.path.isfile(full):
                entries.append(relative)

    entries.sort()
    with open(playlist_path, "w", encoding="utf-8") as handle:
        handle.writelines(f"{entry}\n" for entry in entries)
    return playlist_path
|
||||
|
||||
|
||||
def concat_audio_from_m3u(out_dir: str, m3u_path: str, out_path: str) -> None:
    """Concatenate the playlist's audio files into one MP3 with ffmpeg.

    Args:
        out_dir: Directory the playlist entries are relative to.
        m3u_path: Playlist to read, one relative path per line.
        out_path: Destination of the concatenated file (overwritten).

    Raises:
        RuntimeError: If ffmpeg is not on PATH or no listed file qualifies.
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    if shutil.which("ffmpeg") is None:
        raise RuntimeError("ffmpeg not found in PATH. Install ffmpeg to use --concat.")

    with open(m3u_path, "r", encoding="utf-8") as playlist:
        entries = [stripped for stripped in (raw.strip() for raw in playlist) if stripped]

    sources: list[str] = []
    for entry in entries:
        candidate = os.path.join(out_dir, entry)
        if os.path.isfile(candidate) and entry.lower().endswith(AUDIO_EXTS):
            sources.append(os.path.abspath(candidate))
    if not sources:
        raise RuntimeError("No audio files found to concatenate.")

    # Paths are written inside single quotes; an embedded quote becomes '\''.
    escaped_quote = "'\\''"
    with tempfile.NamedTemporaryFile("w", delete=False, encoding="utf-8") as listing:
        concat_list_path = listing.name
        for source in sources:
            quoted = source.replace("'", escaped_quote)
            listing.write(f"file '{quoted}'\n")

    command = ["ffmpeg", "-hide_banner", "-loglevel", "error"]
    command += ["-f", "concat", "-safe", "0", "-i", concat_list_path]
    command += ["-c:a", "libmp3lame", "-q:a", "4", "-y", out_path]
    try:
        subprocess.run(command, check=True)
    finally:
        # The list file is only scaffolding; failing to delete it is not an error.
        try:
            os.remove(concat_list_path)
        except OSError:
            pass
|
||||
|
||||
|
||||
def extract_audio(
    config: Config,
    lang: str,
    outdir: str | None = None,
    media_dir: str | None = None,
    copy_only_new: bool = False,
    concat: bool = False,
    request: Callable = anki_request,
) -> dict[str, object]:
    """Copy the audio referenced by a language's decks and build a playlist.

    Args:
        config: Loaded configuration.
        lang: Language key used to look up the language name and decks.
        outdir: Output directory; defaults to
            ``<audio_output_root>/<language name>``.
        media_dir: Anki media directory; defaults to ``config.media_dir``.
        copy_only_new: Skip files that already exist in the output directory.
        concat: Also produce ``<language>_concat.mp3``.
        request: Callable used to talk to AnkiConnect.

    Returns:
        Dict with ``copied`` (number of files copied), ``playlist``,
        ``outdir`` and ``concat`` (path or ``None``).

    Raises:
        RuntimeError: If no decks are configured for ``lang``.
    """
    language = config.language_name(lang)
    selected_decks = config.decks_for(lang)
    if not selected_decks:
        raise RuntimeError(f"No decks configured for language: {lang}")

    # Expand "~" in an explicit media_dir the same way outdir is expanded.
    media_root = os.path.expanduser(media_dir) if media_dir else config.media_dir
    out_dir = os.path.expanduser(outdir) if outdir else os.path.join(config.audio_output_root, language)
    os.makedirs(out_dir, exist_ok=True)

    found_ids: list[int] = []
    for deck in selected_decks:
        found_ids.extend(request("findNotes", url=config.anki_connect_url, query=f'deck:"{deck}"') or [])
    # A note id returned by more than one deck query is fetched only once.
    all_ids = list(dict.fromkeys(found_ids))

    if not all_ids:
        return {"copied": 0, "playlist": build_playlist(out_dir, language), "outdir": out_dir, "concat": None}

    notes = request("notesInfo", url=config.anki_connect_url, notes=all_ids) or []
    copied: list[str] = []
    handled: set[str] = set()
    for note in notes:
        for field in (note.get("fields", {}) or {}).values():
            val = field.get("value", "") or ""
            for match in re.findall(r"\[sound:(.+?)\]", val):
                # The same file referenced by several fields/notes is handled once.
                if match in handled:
                    continue
                handled.add(match)
                paths = resolve_media_paths(media_root, out_dir, match)
                if paths is None:
                    continue
                src, dst = paths
                # isfile, not exists: a directory here would make copy2 fail.
                if not os.path.isfile(src):
                    continue
                os.makedirs(os.path.dirname(dst), exist_ok=True)
                if copy_only_new and os.path.exists(dst):
                    continue
                shutil.copy2(src, dst)
                copied.append(match)

    m3u_path = build_playlist(out_dir, language)
    concat_path = None
    if concat:
        concat_path = os.path.join(out_dir, f"{language}_concat.mp3")
        concat_audio_from_m3u(out_dir, m3u_path, concat_path)
    return {"copied": len(copied), "playlist": m3u_path, "outdir": out_dir, "concat": concat_path}
|
||||
|
||||
126
saiki/cli.py
Normal file
126
saiki/cli.py
Normal file
@@ -0,0 +1,126 @@
|
||||
"""Unified command-line interface for Saiki."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
|
||||
from .audio import extract_audio
|
||||
from .config import Config, language_choices, load_config
|
||||
from .importer import import_sentences
|
||||
from .words import compare_word_files, extract_words
|
||||
from .youtube import run_youtube
|
||||
|
||||
|
||||
def add_config_arg(parser: argparse.ArgumentParser) -> None:
    """Register the ``--config`` option on ``parser``."""
    option = {"help": "Path to YAML config file."}
    parser.add_argument("--config", **option)
|
||||
|
||||
|
||||
def build_parser(config: Config | None = None) -> argparse.ArgumentParser:
    """Build the top-level parser with one sub-command per tool.

    Args:
        config: Configuration whose language keys become the accepted ``lang``
            values; when omitted, ``load_config()`` is called.

    Returns:
        A parser whose parsed namespace carries the chosen sub-command in
        ``command``.
    """
    lang_codes = language_choices(config or load_config())
    root = argparse.ArgumentParser(description="Saiki: sentence mining and listening tools for Anki.")
    add_config_arg(root)
    commands = root.add_subparsers(dest="command", required=True)

    # audio
    audio_cmd = commands.add_parser("audio", help="Extract Anki audio into playlists.")
    audio_cmd.add_argument("lang", choices=lang_codes)
    audio_cmd.add_argument("--concat", action="store_true")
    audio_cmd.add_argument("--outdir")
    audio_cmd.add_argument("--media-dir")
    audio_cmd.add_argument("--copy-only-new", action="store_true")

    # words
    words_cmd = commands.add_parser("words", help="Extract frequent words from Anki.")
    words_cmd.add_argument("lang", choices=lang_codes)
    note_source = words_cmd.add_mutually_exclusive_group()
    note_source.add_argument("--query")
    note_source.add_argument("--deck", action="append")
    words_cmd.add_argument("--field")
    words_cmd.add_argument("--min-freq", type=int, default=2)
    words_cmd.add_argument("--outdir")
    words_cmd.add_argument("--out")
    words_cmd.add_argument("--full-field", action="store_true")
    words_cmd.add_argument("--spacy-model")

    # compare-words
    compare_cmd = commands.add_parser("compare-words", help="Print words in source that are not in known.")
    compare_cmd.add_argument("source")
    compare_cmd.add_argument("known")

    # youtube
    youtube_cmd = commands.add_parser("youtube", help="Mine a YouTube transcript.")
    youtube_cmd.add_argument("lang", choices=lang_codes)
    youtube_cmd.add_argument("video")
    youtube_cmd.add_argument("--mode", choices=["vocab", "sentences"], default="vocab")
    youtube_cmd.add_argument("--top", type=int)
    youtube_cmd.add_argument("--no-stopwords", action="store_true")
    youtube_cmd.add_argument("--raw", action="store_true")
    youtube_cmd.add_argument("--out")
    youtube_cmd.add_argument("--format", choices=["tsv", "csv"], default="tsv")
    youtube_cmd.add_argument("--known-words", help="Word list to filter vocab_guess against.")
    youtube_cmd.add_argument("--only-new", action="store_true", help="Only export sentences with unknown vocab.")

    # import
    import_cmd = commands.add_parser("import", help="Generate TTS and import sentence cards.")
    import_cmd.add_argument("lang", choices=lang_codes)
    import_cmd.add_argument("sentence_file", nargs="?")
    import_cmd.add_argument("--tags", help="Comma-separated tags. text-to-speech is always included.")

    return root
|
||||
|
||||
|
||||
def main(argv: list[str] | None = None) -> int:
    """Parse ``argv``, run the selected sub-command and return an exit status.

    Returns:
        ``0`` on success; ``1`` when the ``import`` command had failures;
        ``2`` if no known command was matched (help is printed).
    """
    # First pass reads only --config so the real parser can be built from the
    # configuration the user actually selected.
    bootstrap = argparse.ArgumentParser(add_help=False)
    add_config_arg(bootstrap)
    early, _unused = bootstrap.parse_known_args(argv)
    config = load_config(early.config)

    parser = build_parser(config)
    args = parser.parse_args(argv)
    command = args.command

    if command == "audio":
        summary = extract_audio(config, args.lang, args.outdir, args.media_dir, args.copy_only_new, args.concat)
        print(f"Copied {summary['copied']} files")
        print(f"Playlist: {summary['playlist']}")
        print(f"Output directory: {summary['outdir']}")
        if summary["concat"]:
            print(f"Concatenated file: {summary['concat']}")
        return 0

    if command == "words":
        summary = extract_words(
            config,
            args.lang,
            args.query,
            args.deck,
            args.field,
            args.min_freq,
            args.outdir,
            args.out,
            args.full_field,
            args.spacy_model,
        )
        print(f"Query: {summary['query']}")
        print(f"Found {summary['notes']} notes")
        print(f"Extracted {summary['unique']} unique entries")
        print(f"Wrote {summary['written']} entries to: {summary['out']}")
        return 0

    if command == "compare-words":
        for entry in compare_word_files(args.source, args.known):
            print(entry)
        return 0

    if command == "youtube":
        outcome = run_youtube(
            config,
            args.lang,
            args.video,
            args.mode,
            args.top,
            args.no_stopwords,
            args.raw,
            args.out,
            args.format,
            args.known_words,
            args.only_new,
        )
        if args.mode != "sentences":
            for word, count in outcome["items"]:
                print(f"{word}: {count}")
        elif args.out:
            print(f"Wrote {outcome['written']} rows to: {outcome['out']}")
        else:
            for line in outcome["lines"]:
                print(f"[{line.start:.2f}s] {line.text}")
        return 0

    if command == "import":
        report = import_sentences(config, args.lang, args.sentence_file, args.tags)
        print(f"Done. Added {report.added}/{report.processed} cards. Failed: {report.failed}")
        return 1 if report.failed != 0 else 0

    parser.print_help()
    return 2
|
||||
|
||||
|
||||
# Allow running this module directly; the process exit status is main()'s return value.
if __name__ == "__main__":
    raise SystemExit(main(sys.argv[1:]))
|
||||
148
saiki/config.py
Normal file
148
saiki/config.py
Normal file
@@ -0,0 +1,148 @@
|
||||
"""Configuration loading for Saiki.
|
||||
|
||||
Defaults mirror the original scripts. Users can override them with YAML at
|
||||
~/.config/saiki/config.yaml or by passing --config to the CLI.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import copy
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
from typing import Any
|
||||
|
||||
try:
|
||||
import yaml
|
||||
except Exception: # pragma: no cover - handled when config files are loaded
|
||||
yaml = None
|
||||
|
||||
|
||||
# Built-in settings used as the base that a YAML config file is merged onto.
# Path values may contain "~"; they are expanded by the Config accessors.
DEFAULT_CONFIG: dict[str, Any] = {
    "anki_connect_url": "http://localhost:8765",
    "media_dir": "~/.var/app/net.ankiweb.Anki/data/Anki2/User 1/collection.media",
    "audio_output_root": "~/Languages/Anki/anki-audio",
    "word_output_root": "~/Languages/Anki/anki-words",
    "sentence_dir": "~/Languages/Anki",
    "note_model": "Basic",
    "fields": {"front": "Front", "back": "Back"},
    # One entry per language key accepted on the command line.
    "languages": {
        "jp": {
            "name": "japanese",
            "transcript_code": "ja",
            "tts_code": "ja",
            "tts_tld": "com",
            "tts_tempo": 1.35,
            "decks": ["日本語"],
            "word_model": "ja_core_news_lg",
            "field": "Back",
            "sentence_file": "sentences_jp.txt",
        },
        "es": {
            "name": "spanish",
            "transcript_code": "es",
            "tts_code": "es",
            "tts_tld": "es",
            "tts_tempo": 1.25,
            "decks": ["Español"],
            "word_model": "es_core_news_sm",
            "field": "Back",
            "sentence_file": "sentences_es.txt",
        },
    },
}
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class Config:
    """Read-only accessor layer over a configuration mapping.

    Path-valued settings are returned after ``expand_path``. Sections that a
    YAML file left empty (``fields:``, ``languages:``, ``decks:``) are treated
    as empty rather than raising ``TypeError``.
    """

    # Raw settings mapping, e.g. as assembled by load_config().
    data: dict[str, Any]

    @property
    def anki_connect_url(self) -> str:
        """URL of the AnkiConnect endpoint."""
        return str(self.data["anki_connect_url"])

    @property
    def media_dir(self) -> str:
        """Expanded path of the Anki media directory."""
        return expand_path(str(self.data["media_dir"]))

    @property
    def audio_output_root(self) -> str:
        """Expanded root directory for extracted audio."""
        return expand_path(str(self.data["audio_output_root"]))

    @property
    def word_output_root(self) -> str:
        """Expanded root directory for word lists."""
        return expand_path(str(self.data["word_output_root"]))

    @property
    def sentence_dir(self) -> str:
        """Expanded directory that relative sentence files live in."""
        return expand_path(str(self.data["sentence_dir"]))

    @property
    def note_model(self) -> str:
        """Anki note model name (``"Basic"`` when unset)."""
        return str(self.data.get("note_model", "Basic"))

    @property
    def fields(self) -> dict[str, str]:
        """Copy of the ``fields`` mapping; empty when missing or null."""
        return dict(self.data.get("fields") or {})

    @property
    def languages(self) -> dict[str, dict[str, Any]]:
        """Copy of the ``languages`` mapping; empty when missing or null."""
        return dict(self.data.get("languages") or {})

    def language(self, lang: str) -> dict[str, Any]:
        """Return a copy of the settings for ``lang``.

        Raises:
            ValueError: If ``lang`` is not a configured language key.
        """
        try:
            return dict(self.languages[lang])
        except KeyError as e:
            available = ", ".join(sorted(self.languages))
            raise ValueError(f"Unsupported language '{lang}'. Available: {available}") from e

    def language_name(self, lang: str) -> str:
        """Return the ``name`` setting of ``lang``."""
        return str(self.language(lang)["name"])

    def transcript_code(self, lang: str) -> str:
        """Return the ``transcript_code`` setting of ``lang``."""
        return str(self.language(lang)["transcript_code"])

    def decks_for(self, lang: str) -> list[str]:
        """Return the deck names configured for ``lang`` (possibly empty)."""
        decks = self.language(lang).get("decks")
        if not decks:
            return []
        # A bare string is one deck name; list() would split it into characters.
        if isinstance(decks, str):
            return [decks]
        return list(decks)

    def field_for(self, lang: str) -> str:
        """Return the note field to read for ``lang``.

        Falls back to the global ``fields["back"]`` and then to ``"Back"``.
        """
        return str(self.language(lang).get("field", self.fields.get("back", "Back")))

    def sentence_file_for(self, lang: str) -> str:
        """Return the sentence file path for ``lang``.

        The configured value (default ``sentences_<lang>.txt``) is expanded
        first, so ``~/x`` and ``$HOME/x`` are both recognised as standalone
        paths; anything still relative is placed under ``sentence_dir``.
        """
        value = expand_path(str(self.language(lang).get("sentence_file", f"sentences_{lang}.txt")))
        # "~name" that could not be expanded is returned untouched, as before.
        if os.path.isabs(value) or value.startswith("~"):
            return value
        return os.path.join(self.sentence_dir, value)
|
||||
|
||||
|
||||
def expand_path(path: str) -> str:
    """Expand environment variables, then a leading ``~``, in ``path``."""
    with_vars = os.path.expandvars(path)
    return os.path.expanduser(with_vars)
|
||||
|
||||
|
||||
def default_config_path() -> str:
    """Return the expanded location of the per-user config file."""
    location = "~/.config/saiki/config.yaml"
    return expand_path(location)
|
||||
|
||||
|
||||
def deep_merge(base: dict[str, Any], override: dict[str, Any]) -> dict[str, Any]:
    """Return a deep copy of ``base`` with ``override`` layered on top.

    Where both sides hold a dict under the same key the two are merged
    recursively; in every other case the override value replaces the base
    value. ``base`` is not modified.
    """
    merged = copy.deepcopy(base)
    for name in override:
        incoming = override[name]
        existing = merged.get(name)
        both_mappings = isinstance(incoming, dict) and isinstance(existing, dict)
        merged[name] = deep_merge(existing, incoming) if both_mappings else incoming
    return merged
|
||||
|
||||
|
||||
def load_config(path: str | None = None) -> Config:
    """Load the configuration, layering an optional YAML file over the defaults.

    Args:
        path: Explicit config file. When omitted, the default per-user
            location is used if it exists.

    Returns:
        A ``Config`` wrapping the merged settings.

    Raises:
        RuntimeError: If an explicitly requested file does not exist, if
            PyYAML is unavailable while a file needs loading, or if the file's
            top level is not a mapping.
    """
    config = copy.deepcopy(DEFAULT_CONFIG)
    config_path = expand_path(path) if path else default_config_path()
    if not os.path.exists(config_path):
        # The default location is optional, but a path the caller asked for
        # must not be ignored silently (e.g. a typo in --config).
        if path:
            raise RuntimeError(f"Config file not found: {config_path}")
        return Config(config)

    if yaml is None:
        raise RuntimeError("Loading config files requires PyYAML. Install pyyaml.")
    with open(config_path, "r", encoding="utf-8") as fh:
        loaded = yaml.safe_load(fh) or {}
    if not isinstance(loaded, dict):
        raise RuntimeError(f"Config must be a YAML mapping: {config_path}")
    return Config(deep_merge(config, loaded))
|
||||
|
||||
|
||||
def language_choices(config: Config) -> list[str]:
    """Return the configured language keys in sorted order."""
    codes = list(config.languages)
    codes.sort()
    return codes
|
||||
112
saiki/importer.py
Normal file
112
saiki/importer.py
Normal file
@@ -0,0 +1,112 @@
|
||||
"""Generate TTS audio and add sentence notes to Anki."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import csv
|
||||
import shutil
|
||||
import subprocess
|
||||
import tempfile
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from typing import Callable
|
||||
|
||||
from .ankiconnect import anki_request
|
||||
from .config import Config
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class ImportResult:
    """Immutable summary of one sentence-import run."""

    # Number of sentences read from the source file.
    processed: int
    # Number of sentences for which the note was added.
    added: int
    # Number of sentences whose processing raised an exception.
    failed: int
|
||||
|
||||
|
||||
def parse_tags(value: str | None) -> list[str]:
    """Build the tag list for imported notes.

    ``text-to-speech`` always comes first. A non-empty ``value`` contributes
    its comma-separated, stripped, non-blank items; otherwise ``AI-generated``
    is added.
    """
    if not value:
        return ["text-to-speech", "AI-generated"]
    pieces = [piece.strip() for piece in value.split(",")]
    return ["text-to-speech", *(piece for piece in pieces if piece)]
|
||||
|
||||
|
||||
def require_command(name: str) -> None:
    """Raise ``RuntimeError`` unless ``name`` resolves to an executable on PATH."""
    located = shutil.which(name)
    if located is not None:
        return
    raise RuntimeError(f"Required command not found: {name}")
|
||||
|
||||
|
||||
def generate_tts(sentence: str, raw_output: str, lang_code: str, tld: str) -> None:
    """Synthesize ``sentence`` to ``raw_output`` with ``gtts-cli``.

    Args:
        sentence: Text to speak.
        raw_output: Path of the audio file to write.
        lang_code: Value passed as ``--lang``.
        tld: Value passed as ``--tld``.

    Raises:
        subprocess.CalledProcessError: If ``gtts-cli`` exits with a non-zero status.
    """
    # Options come first and "--" ends option parsing, so a sentence that
    # begins with "-" (e.g. a dialogue dash) is taken as text, not as a flag.
    command = ["gtts-cli", "--lang", lang_code, "--tld", tld, "--output", raw_output, "--", sentence]
    subprocess.run(command, check=True)
|
||||
|
||||
|
||||
def speed_audio(raw_output: str, output_path: str, tempo: float) -> None:
    """Re-encode ``raw_output`` to ``output_path`` through ffmpeg's ``atempo`` filter.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    command = ["ffmpeg", "-loglevel", "error", "-i", raw_output]
    command += ["-filter:a", f"atempo={tempo}", "-y", output_path]
    # stdin is detached so ffmpeg cannot read from the caller's terminal.
    subprocess.run(command, stdin=subprocess.DEVNULL, check=True)
|
||||
|
||||
|
||||
def read_sentences(path: str) -> list[str]:
    """Read the sentences to import from a text, TSV or CSV file.

    ``.tsv`` / ``.csv`` files must have a header row containing a
    ``sentence`` column; any other file is read as one sentence per line.
    Blank entries are dropped and surrounding whitespace is stripped.

    Args:
        path: File to read; a leading ``~`` is expanded.

    Returns:
        The non-empty sentences in file order.

    Raises:
        RuntimeError: If a TSV/CSV file has no ``sentence`` header.
    """
    expanded = os.path.expanduser(path)
    lowered = expanded.lower()
    # "utf-8-sig" reads plain UTF-8 too but drops a leading BOM, which would
    # otherwise stick to the first header name or the first sentence.
    if lowered.endswith((".tsv", ".csv")):
        delimiter = "\t" if lowered.endswith(".tsv") else ","
        with open(expanded, "r", encoding="utf-8-sig", newline="") as fh:
            reader = csv.DictReader(fh, delimiter=delimiter)
            if not reader.fieldnames or "sentence" not in reader.fieldnames:
                raise RuntimeError("TSV/CSV sentence imports must include a 'sentence' header.")
            # Rows shorter than the header yield None for the missing column.
            cells = ((row.get("sentence") or "").strip() for row in reader)
            return [cell for cell in cells if cell]

    with open(expanded, "r", encoding="utf-8-sig") as fh:
        return [line.strip() for line in fh if line.strip()]
|
||||
|
||||
|
||||
def import_sentences(
    config: Config,
    lang: str,
    sentence_file: str | None = None,
    tags_value: str | None = None,
    request: Callable = anki_request,
) -> ImportResult:
    """Generate TTS audio for each sentence and add it to Anki as a note.

    The first deck configured for ``lang`` receives the notes. The sentence
    goes into the back field and the generated audio is attached to the front
    field. A sentence that fails is counted and logged; the run continues.

    Args:
        config: Loaded configuration.
        lang: Language key.
        sentence_file: Source file; defaults to the language's configured file.
        tags_value: Comma-separated extra tags (see ``parse_tags``).
        request: Callable used to talk to AnkiConnect.

    Returns:
        Counts of processed, added and failed sentences.

    Raises:
        RuntimeError: If ``gtts-cli`` or ``ffmpeg`` is missing, or no deck is
            configured for ``lang``.
    """
    import logging  # local import: this module does not import logging at top level

    require_command("gtts-cli")
    require_command("ffmpeg")

    language = config.language(lang)
    decks = config.decks_for(lang)
    if not decks:
        raise RuntimeError(f"No deck configured for language: {lang}")
    deck = decks[0]

    source = os.path.expanduser(sentence_file) if sentence_file else config.sentence_file_for(lang)
    sentences = read_sentences(source)
    tags = parse_tags(tags_value)
    front_field = config.fields.get("front", "Front")
    back_field = config.fields.get("back", "Back")
    log = logging.getLogger(__name__)
    added = 0
    failed = 0

    with tempfile.TemporaryDirectory() as temp_dir:
        for index, sentence in enumerate(sentences):
            # The running index keeps names unique within the same second.
            basename = f"tts_{time.strftime('%Y%m%d_%H%M%S')}_{lang}_{os.getpid()}_{index}"
            raw_output = os.path.join(temp_dir, f"{basename}_original.mp3")
            output_path = os.path.join(temp_dir, f"{basename}.mp3")
            try:
                generate_tts(sentence, raw_output, str(language["tts_code"]), str(language["tts_tld"]))
                speed_audio(raw_output, output_path, float(language["tts_tempo"]))
                request(
                    "addNote",
                    url=config.anki_connect_url,
                    note={
                        "deckName": deck,
                        "modelName": config.note_model,
                        "fields": {front_field: "", back_field: sentence},
                        "options": {"allowDuplicate": False},
                        "tags": tags,
                        "audio": [{"path": output_path, "filename": f"{basename}.mp3", "fields": [front_field]}],
                    },
                )
                added += 1
            except Exception:
                # Best effort: one bad sentence must not abort the batch, but
                # the reason is recorded instead of being discarded.
                log.warning("Failed to import sentence %r", sentence, exc_info=True)
                failed += 1
    return ImportResult(processed=len(sentences), added=added, failed=failed)
|
||||
29
saiki/text.py
Normal file
29
saiki/text.py
Normal file
@@ -0,0 +1,29 @@
|
||||
"""Text cleanup helpers shared by tools."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from html import unescape
|
||||
|
||||
import regex as re
|
||||
|
||||
|
||||
def extract_first_visible_line(text: str) -> str:
    """Return the first line of ``text`` once HTML markup is removed.

    Entities are unescaped, ``br``/``div``/``p`` tags become line breaks and
    every other tag is dropped. Returns ``""`` when nothing visible remains.
    """
    decoded = unescape(text or "")
    with_breaks = re.sub(r"</?(br|div|p)[^>]*>", "\n", decoded, flags=re.IGNORECASE)
    visible = re.sub(r"<[^>]+>", "", with_breaks).strip()
    if not visible:
        return ""
    return visible.splitlines()[0]
|
||||
|
||||
|
||||
def extract_visible_text(text: str) -> str:
    """Return all visible text of an HTML fragment.

    Entities are unescaped, ``br``/``div``/``p`` tags become line breaks,
    other tags are dropped, runs of spaces/tabs shrink to one space and
    consecutive line breaks shrink to one.
    """
    cleaned = unescape(text or "")
    substitutions = (
        (r"</?(br|div|p)[^>]*>", "\n", re.IGNORECASE),
        (r"<[^>]+>", "", 0),
        (r"[ \t]+", " ", 0),
        (r"\n{2,}", "\n", 0),
    )
    for pattern, replacement, flags in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned, flags=flags)
    return cleaned.strip()
|
||||
|
||||
|
||||
def normalize_word_key(value: str) -> str:
    """Return ``value`` stripped, lower-cased and with whitespace runs as one space."""
    lowered = value.strip().lower()
    return re.sub(r"\s+", " ", lowered)
|
||||
|
||||
183
saiki/words.py
Normal file
183
saiki/words.py
Normal file
@@ -0,0 +1,183 @@
|
||||
"""Extract and compare language-learning vocabulary."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import os
|
||||
from collections import Counter
|
||||
from typing import Callable
|
||||
|
||||
import regex as re
|
||||
|
||||
from .ankiconnect import anki_request
|
||||
from .config import Config
|
||||
from .text import extract_first_visible_line, extract_visible_text, normalize_word_key
|
||||
|
||||
# Matches a run made only of Hiragana, Katakana, Han characters or the long-vowel mark.
JAPANESE_CHAR_RE = re.compile(r"[\p{Script=Hiragana}\p{Script=Katakana}\p{Script=Han}ー]+")
# Surface forms that japanese_filter rejects outright.
JAPANESE_PARTICLES = {
    "は", "が", "を", "に", "へ", "で", "と", "や", "も", "から", "まで", "より", "ば", "なら",
    "の", "ね", "よ", "ぞ", "ぜ", "さ", "わ", "か", "な", "って", "とき", "ってば", "けど", "けれど",
    "しかし", "でも", "ながら", "ほど", "し", "もの", "こと", "ところ", "よう", "らしい", "られる",
}
# Lemmas that japanese_filter rejects outright.
JAPANESE_GRAMMAR_EXCLUDE = {
    "て", "た", "ます", "れる", "てる", "ぬ", "ん", "しまう", "いる", "ない", "なる", "ある", "だ", "です",
}
# Part-of-speech tags (token.pos_) that japanese_filter lets through.
JAPANESE_ALLOWED_POS = {"NOUN", "PROPN", "VERB", "ADJ"}
|
||||
|
||||
|
||||
def setup_logging(logfile: str) -> None:
    """Create the log file's directory and configure INFO-level file logging."""
    log_dir = os.path.dirname(os.path.abspath(logfile))
    os.makedirs(log_dir, exist_ok=True)
    logging.basicConfig(
        filename=logfile,
        level=logging.INFO,
        format="%(asctime)s [%(levelname)s] %(message)s",
    )
|
||||
|
||||
|
||||
def build_query_from_decks(decks: list[str]) -> str:
    """Return an Anki search string that ORs one ``deck:"..."`` clause per deck."""
    clauses = [f'deck:"{name}"' for name in decks]
    return " OR ".join(clauses)
|
||||
|
||||
|
||||
def japanese_filter(token) -> bool:
    """Decide whether a token counts as Japanese vocabulary.

    A token is kept when its surface text consists solely of characters
    matched by ``JAPANESE_CHAR_RE``, it is neither a listed particle nor a
    listed grammar lemma, its ``pos_`` is in ``JAPANESE_ALLOWED_POS``, and it
    is not flagged as a stop word, URL or e-mail address.

    Args:
        token: Object exposing ``text`` and ``lemma_``; ``pos_``, ``is_stop``,
            ``like_url`` and ``like_email`` are read when present.

    Returns:
        ``True`` if the token should be counted.
    """
    text = (token.text or "").strip()
    lemma = (token.lemma_ or "").strip()
    if not text or not JAPANESE_CHAR_RE.fullmatch(text):
        return False
    if lemma in JAPANESE_GRAMMAR_EXCLUDE or text in JAPANESE_PARTICLES:
        return False
    if getattr(token, "pos_", None) not in JAPANESE_ALLOWED_POS:
        return False
    # The former markup-character and HTML-tag-name checks were removed: text
    # that passed the full-match above cannot contain ASCII, so they never fired.
    return not any(getattr(token, flag, False) for flag in ("is_stop", "like_url", "like_email"))
|
||||
|
||||
|
||||
def spanish_filter(token) -> bool:
    """Keep alphabetic tokens that are not stop words."""
    if not getattr(token, "is_alpha", False):
        return False
    return not getattr(token, "is_stop", False)
|
||||
|
||||
|
||||
def spanish_format(token) -> str:
    """Return the token's lemma (or its text when no lemma), lower-cased and stripped."""
    chosen = token.lemma_ or token.text or ""
    return chosen.lower().strip()
|
||||
|
||||
|
||||
def japanese_format(token) -> str:
    """Format a token as ``lemma (surface)``, or just one of them.

    The combined form is used only when both are present and differ;
    otherwise the lemma is returned, falling back to the surface text.
    """
    lemma = (token.lemma_ or "").strip()
    surface = (token.text or "").strip()
    if not lemma:
        return surface
    if not surface or lemma == surface:
        return lemma
    return f"{lemma} ({surface})"
|
||||
|
||||
|
||||
# Per-language token handling, keyed by the language *name* from the config
# (not the short language key): which tokens to keep and how to render them.
LANGUAGE_PROFILES = {
    "spanish": {"token_filter": spanish_filter, "output_format": spanish_format},
    "japanese": {"token_filter": japanese_filter, "output_format": japanese_format},
}
|
||||
|
||||
|
||||
def load_spacy_model(model_name: str):
    """Import spaCy lazily and return the loaded pipeline ``model_name``.

    Raises:
        RuntimeError: If spaCy cannot be imported or the model cannot be
            loaded; the underlying exception is chained.
    """
    try:
        import spacy  # type: ignore
    except Exception as import_error:
        raise RuntimeError(
            "Failed to import spaCy. Use a Python version supported by spaCy."
        ) from import_error

    try:
        pipeline = spacy.load(model_name)
    except Exception as load_error:
        raise RuntimeError(
            f"Failed to load spaCy model '{model_name}'. Try: python -m spacy download {model_name}"
        ) from load_error
    return pipeline
|
||||
|
||||
|
||||
def get_notes(query: str, config: Config, request: Callable = anki_request) -> list[dict]:
    """Return the note-info dicts of all notes matching ``query``.

    Issues ``findNotes`` and, only when ids came back, ``notesInfo``.
    A falsy response from either call yields an empty list.
    """
    endpoint = config.anki_connect_url
    matching_ids = request("findNotes", url=endpoint, query=query) or []
    if matching_ids:
        return request("notesInfo", url=endpoint, notes=matching_ids) or []
    return []
|
||||
|
||||
|
||||
def extract_counts(
    notes: list[dict],
    field_name: str,
    nlp,
    token_filter: Callable,
    output_format: Callable,
    use_full_field: bool,
) -> Counter:
    """Count formatted tokens across one field of every note.

    Args:
        notes: Note-info dicts; each may hold a ``fields`` mapping whose
            entries carry a ``value`` string.
        field_name: Field to read from each note.
        nlp: Callable that turns text into an iterable of tokens.
        token_filter: Predicate selecting the tokens to count.
        output_format: Maps a kept token to its counter key; empty keys are
            skipped.
        use_full_field: Use the whole visible field text instead of only its
            first visible line.

    Returns:
        Counter mapping each key to its number of occurrences.
    """
    to_text = extract_visible_text if use_full_field else extract_first_visible_line
    tally: Counter = Counter()
    for note in notes:
        field_map = note.get("fields", {}) or {}
        cell = field_map.get(field_name, {}) or {}
        visible = to_text(cell.get("value", "") or "")
        if not visible:
            continue
        keys = (output_format(tok) for tok in nlp(visible) if token_filter(tok))
        tally.update(key for key in keys if key)
    return tally
|
||||
|
||||
|
||||
def write_counts(counter: Counter, out_path: str, min_freq: int) -> int:
    """Write ``word count`` lines for entries seen at least ``min_freq`` times.

    Lines are ordered by descending count, then by word. The parent directory
    of ``out_path`` is created if needed.

    Returns:
        Number of lines written.
    """
    kept = sorted(
        ((word, freq) for word, freq in counter.items() if freq >= min_freq),
        key=lambda pair: (-pair[1], pair[0]),
    )
    target_dir = os.path.dirname(os.path.abspath(out_path))
    os.makedirs(target_dir, exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as handle:
        handle.writelines(f"{word} {freq}\n" for word, freq in kept)
    return len(kept)
|
||||
|
||||
|
||||
def _strip_count(entry: str) -> str:
    """Return ``entry`` without a trailing ``" <count>"`` suffix, if it has one."""
    head, sep, tail = entry.rpartition(" ")
    # Only a numeric last token is a frequency; "buenos días" stays whole.
    if sep and tail.isdecimal():
        return head
    return entry


def read_word_file(path: str) -> set[str]:
    """Read a word list into a set of normalized keys.

    Each non-blank line is either ``word`` or ``word count`` (the layout
    produced by ``write_counts``); the count, when present, is discarded.

    Args:
        path: File to read; a leading ``~`` is expanded.

    Returns:
        Set of ``normalize_word_key`` results, one per entry.
    """
    words: set[str] = set()
    with open(os.path.expanduser(path), "r", encoding="utf-8") as fh:
        for line in fh:
            stripped = line.strip()
            if not stripped:
                continue
            words.add(normalize_word_key(_strip_count(stripped)))
    return words


def compare_word_files(source_path: str, known_path: str) -> list[str]:
    """Return the lines of ``source_path`` whose word is absent from ``known_path``.

    Both files use the layout accepted by ``read_word_file``. Words are
    compared by normalized key; matching lines are returned stripped, in
    source order, with any count left in place.
    """
    known = read_word_file(known_path)
    new_words: list[str] = []
    with open(os.path.expanduser(source_path), "r", encoding="utf-8") as fh:
        for line in fh:
            stripped = line.strip()
            if not stripped:
                continue
            if normalize_word_key(_strip_count(stripped)) not in known:
                new_words.append(stripped)
    return new_words
|
||||
|
||||
|
||||
def extract_words(
    config: Config,
    lang: str,
    query: str | None = None,
    decks: list[str] | None = None,
    field: str | None = None,
    min_freq: int = 2,
    outdir: str | None = None,
    out: str | None = None,
    full_field: bool = False,
    spacy_model: str | None = None,
    request: Callable = anki_request,
) -> dict[str, object]:
    """Count vocabulary in Anki notes and write a frequency list.

    Args:
        config: Loaded configuration.
        lang: Language key.
        query: Explicit Anki search; takes precedence over ``decks``.
        decks: Deck names to search; defaults to the language's decks.
        field: Note field to read; defaults to ``config.field_for(lang)``.
        min_freq: Minimum count for an entry to be written.
        outdir: Output directory; defaults to
            ``<word_output_root>/<language name>``.
        out: Output file; defaults to ``words_<lang>.txt`` in the output
            directory.
        full_field: Analyse the whole field instead of its first visible line.
        spacy_model: spaCy model name; defaults to the language's
            ``word_model``.
        request: Callable used to talk to AnkiConnect.

    Returns:
        Dict with ``query``, ``notes``, ``unique``, ``written`` and ``out``.

    Raises:
        RuntimeError: If there is neither a query nor any deck, no spaCy model
            is configured, the model cannot be loaded, or the field is missing
            from the first note.
    """
    language_bucket = config.language_name(lang)
    profile = LANGUAGE_PROFILES[language_bucket]

    if query:
        search_query = query
    else:
        deck_names = decks or config.decks_for(lang)
        if not deck_names:
            # Without decks the built query would be an empty string, which is
            # no deck filter at all; refuse rather than search with it.
            raise RuntimeError(f"No decks configured for language: {lang}")
        search_query = build_query_from_decks(deck_names)

    out_dir = os.path.expanduser(outdir) if outdir else os.path.join(config.word_output_root, language_bucket)
    out_path = os.path.expanduser(out) if out else os.path.join(out_dir, f"words_{lang}.txt")

    model_name = spacy_model or config.language(lang).get("word_model")
    if not model_name:
        # str(None) would otherwise be passed on as a model called "None".
        raise RuntimeError(f"No spaCy model configured for language: {lang}")
    field_name = field or config.field_for(lang)

    nlp = load_spacy_model(str(model_name))
    notes = get_notes(search_query, config, request=request)
    if notes:
        # Only the first note is inspected to validate the field name.
        fields0 = notes[0].get("fields", {}) or {}
        if field_name not in fields0:
            raise RuntimeError(f"Field '{field_name}' not found. Available fields: {list(fields0.keys())}")

    counter = extract_counts(notes, field_name, nlp, profile["token_filter"], profile["output_format"], full_field)
    written = write_counts(counter, out_path, min_freq)
    return {"query": search_query, "notes": len(notes), "unique": len(counter), "written": written, "out": out_path}
|
||||
|
||||
179
saiki/youtube.py
Normal file
179
saiki/youtube.py
Normal file
@@ -0,0 +1,179 @@
|
||||
"""YouTube transcript mining and Anki-ready exports."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import csv
|
||||
import os
|
||||
import re
|
||||
from collections import Counter
|
||||
from dataclasses import dataclass
|
||||
from urllib.parse import parse_qs, urlparse
|
||||
|
||||
from youtube_transcript_api import YouTubeTranscriptApi
|
||||
|
||||
from .config import Config
|
||||
from .text import normalize_word_key
|
||||
from .words import read_word_file
|
||||
|
||||
# Small lower-case stopword sets keyed by transcript language code.
STOPWORDS = {
    "es": {
        "de", "la", "que", "el", "en", "y", "a", "los", "del", "se", "las", "por",
        "un", "para", "con", "no", "una", "su", "al", "lo", "como",
    },
    "en": {"the", "is", "and", "of", "to", "in", "it", "that", "on", "you", "this", "for", "with"},
    "ja": {"の", "に", "は", "を", "た", "が", "で", "て", "です", "ます", "する", "ある", "いる"},
}
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class TranscriptLine:
    """One non-empty transcript snippet."""

    # Start offset of the snippet; printed with an "s" suffix by the CLI.
    start: float
    # Snippet text with line breaks replaced by spaces.
    text: str
|
||||
|
||||
|
||||
def extract_video_id(url_or_id: str) -> str:
    """Return the video id contained in a YouTube URL, or the input itself.

    Recognised forms: ``youtu.be/<id>``, ``watch?v=<id>`` and
    ``/shorts/<id>``, ``/embed/<id>``, ``/live/<id>``, ``/v/<id>`` on
    youtube.com hosts, with or without a scheme or ``www.`` prefix.

    Args:
        url_or_id: A YouTube URL or a bare video id.

    Returns:
        The extracted id; ``url_or_id`` unchanged when it does not look like a
        YouTube URL or no id can be found in it.
    """
    if "youtube" not in url_or_id and "youtu.be" not in url_or_id:
        return url_or_id

    candidate = url_or_id.strip()
    # urlparse only fills in the hostname when a scheme is present.
    parsed = urlparse(candidate if "://" in candidate else f"https://{candidate}")
    host = (parsed.hostname or "").lower()
    if host.startswith("www."):
        host = host[len("www."):]
    segments = [part for part in parsed.path.split("/") if part]

    if host == "youtu.be":
        return segments[0] if segments else url_or_id

    if host in {"youtube.com", "m.youtube.com", "music.youtube.com", "youtube-nocookie.com"}:
        values = parse_qs(parsed.query).get("v", [])
        if values:
            return values[0]
        if len(segments) >= 2 and segments[0] in {"shorts", "embed", "live", "v"}:
            return segments[1]
    return url_or_id
|
||||
|
||||
|
||||
def video_url(video_or_id: str) -> str:
    """Return the canonical watch URL for a video id or YouTube URL."""
    return "https://www.youtube.com/watch?v=" + extract_video_id(video_or_id)
|
||||
|
||||
|
||||
def fetch_transcript(video_id: str, lang_code: str):
    """Fetch the transcript of ``video_id`` in ``lang_code``.

    Uses the instance method ``fetch`` when the installed
    youtube-transcript-api provides it, otherwise the class-level
    ``get_transcript``.

    Raises:
        RuntimeError: If the installed library offers neither entry point.
    """
    wanted = [lang_code]
    if hasattr(YouTubeTranscriptApi, "fetch"):
        return YouTubeTranscriptApi().fetch(video_id, languages=wanted)
    if not hasattr(YouTubeTranscriptApi, "get_transcript"):
        raise RuntimeError("Unsupported youtube-transcript-api version.")
    return YouTubeTranscriptApi.get_transcript(video_id, languages=wanted)
|
||||
|
||||
|
||||
def snippet_text(entry) -> str:
    """Return the ``text`` of a transcript entry (dict or object), or ``""``."""
    raw = entry.get("text", "") if isinstance(entry, dict) else getattr(entry, "text", "")
    return raw or ""
|
||||
|
||||
|
||||
def snippet_start(entry) -> float:
    """Return the ``start`` of a transcript entry (dict or object) as a float, default ``0.0``."""
    raw = entry.get("start", 0.0) if isinstance(entry, dict) else getattr(entry, "start", 0.0)
    return float(raw or 0.0)
|
||||
|
||||
|
||||
def transcript_lines(entries) -> list[TranscriptLine]:
    """Convert raw transcript entries into ``TranscriptLine`` records.

    Line breaks inside a snippet become spaces; snippets that are empty after
    stripping are dropped.
    """
    cleaned = ((entry, snippet_text(entry).replace("\n", " ").strip()) for entry in entries)
    return [TranscriptLine(snippet_start(entry), text) for entry, text in cleaned if text]
|
||||
|
||||
|
||||
# Tagger shared by all tokenize_japanese calls; created on first use.
_JAPANESE_TAGGER = None


def tokenize_japanese(text: str) -> list[str]:
    """Split Japanese ``text`` into surface-form tokens using fugashi.

    The ``Tagger`` is created on the first call and reused afterwards, so
    tokenizing a transcript line by line does not rebuild it for every line.

    Raises:
        RuntimeError: If fugashi is not installed.
    """
    global _JAPANESE_TAGGER
    if _JAPANESE_TAGGER is None:
        try:
            from fugashi import Tagger
        except ImportError as e:
            raise RuntimeError('Japanese requires fugashi. Install: pip install "fugashi[unidic-lite]"') from e
        _JAPANESE_TAGGER = Tagger()
    return [w.surface for w in _JAPANESE_TAGGER(text)]
|
||||
|
||||
|
||||
def tokenize_spanish(text: str, raw: bool = False) -> list[str]:
    """Return the word tokens of ``text``; lower-cased unless ``raw`` is true."""
    found = re.findall(r"\b[\wáéíóúñü]+\b", text)
    if raw:
        return found
    return [piece.lower() for piece in found]
|
||||
|
||||
|
||||
def tokenize_text(text: str, lang_code: str, raw: bool = False) -> list[str]:
    """Tokenize ``text`` with the tokenizer matching ``lang_code``.

    ``"ja"`` uses the Japanese tokenizer (``raw`` is not passed to it); every
    other code uses the Spanish word tokenizer.
    """
    if lang_code == "ja":
        return tokenize_japanese(text)
    return tokenize_spanish(text, raw=raw)
|
||||
|
||||
|
||||
def count_words(tokens: list[str], lang_code: str, remove_stopwords: bool = True) -> Counter:
    """Count token occurrences, optionally dropping stopwords.

    Args:
        tokens: Tokens to count.
        lang_code: Key into ``STOPWORDS``; unknown codes have no stopwords.
        remove_stopwords: Drop tokens listed for ``lang_code``.

    Returns:
        Counter keyed by token, with each token's casing left as given.
    """
    if remove_stopwords:
        stopwords = STOPWORDS.get(lang_code, set())
        # The stopword sets are lower-case; compare case-insensitively so that
        # tokens kept in original casing ("De", "La") are filtered as well.
        tokens = [t for t in tokens if t.lower() not in stopwords]
    return Counter(tokens)
|
||||
|
||||
|
||||
def sentence_vocab(sentence: str, lang_code: str, known_words: set[str] | None = None) -> list[str]:
    """List the candidate vocabulary of ``sentence`` in order of appearance.

    A token is skipped when its normalized key was already emitted, is a
    stopword for ``lang_code``, or is contained in ``known_words``.

    Returns:
        The tokens as produced by the tokenizer, de-duplicated by normalized key.
    """
    stopwords = STOPWORDS.get(lang_code, set())
    emitted: set[str] = set()
    picked: list[str] = []
    for token in tokenize_text(sentence, lang_code):
        key = normalize_word_key(token)
        already_covered = key in emitted or key in stopwords
        is_known = known_words is not None and key in known_words
        if already_covered or is_known:
            continue
        emitted.add(key)
        picked.append(token)
    return picked
|
||||
|
||||
|
||||
def write_sentence_export(
    lines: list[TranscriptLine],
    out_path: str,
    video: str,
    lang_code: str,
    delimiter: str = "\t",
    known_words_path: str | None = None,
    only_new: bool = False,
) -> int:
    """Write transcript lines as delimited rows with a header.

    Columns: ``sentence``, ``timestamp`` (two decimals), ``video_url`` and
    ``vocab_guess`` (comma-separated). The parent directory of ``out_path`` is
    created if needed.

    Args:
        lines: Transcript lines to export.
        out_path: Destination file (overwritten).
        video: Video id or URL used to build the ``video_url`` column.
        lang_code: Language code used for tokenizing.
        delimiter: Field delimiter.
        known_words_path: Optional word list; its words are left out of
            ``vocab_guess``.
        only_new: Skip lines whose ``vocab_guess`` would be empty.

    Returns:
        Number of data rows written (header excluded).
    """
    known = read_word_file(known_words_path) if known_words_path else None
    target_dir = os.path.dirname(os.path.abspath(out_path))
    os.makedirs(target_dir, exist_ok=True)

    header = ["sentence", "timestamp", "video_url", "vocab_guess"]
    rows_written = 0
    with open(out_path, "w", encoding="utf-8", newline="") as handle:
        export = csv.writer(handle, delimiter=delimiter)
        export.writerow(header)
        for entry in lines:
            guesses = sentence_vocab(entry.text, lang_code, known)
            if guesses or not only_new:
                export.writerow([entry.text, f"{entry.start:.2f}", video_url(video), ", ".join(guesses)])
                rows_written += 1
    return rows_written
|
||||
|
||||
|
||||
def run_youtube(
    config: Config,
    lang: str,
    video: str,
    mode: str = "vocab",
    top: int | None = None,
    no_stopwords: bool = False,
    raw: bool = False,
    out: str | None = None,
    fmt: str = "tsv",
    known_words: str | None = None,
    only_new: bool = False,
) -> dict[str, object]:
    """Fetch a video's transcript and mine it for sentences or vocabulary.

    In ``"sentences"`` mode the result holds either the transcript lines
    (no ``out``) or, after writing the export file, the counts ``lines`` and
    ``written`` plus ``out``. In any other mode the result holds ``items``, a
    list of ``(word, count)`` pairs, which is also written to ``out`` as
    ``word count`` lines when ``out`` is given.

    Args:
        config: Loaded configuration.
        lang: Language key.
        video: Video id or URL.
        mode: ``"sentences"`` or vocabulary counting.
        top: Keep only this many most common words (all when falsy).
        no_stopwords: Keep stopwords in the counts.
        raw: Passed to the tokenizer to keep original casing.
        out: Optional output file.
        fmt: ``"csv"`` for comma-separated sentence export, otherwise tabs.
        known_words: Optional word list for the sentence export.
        only_new: Passed to the sentence export.
    """
    lang_code = config.transcript_code(lang)
    video_id = extract_video_id(video)
    lines = transcript_lines(fetch_transcript(video_id, lang_code))

    if mode == "sentences":
        if not out:
            return {"mode": mode, "lines": lines}
        delimiter = "," if fmt == "csv" else "\t"
        written = write_sentence_export(lines, out, video_id, lang_code, delimiter, known_words, only_new)
        return {"mode": mode, "lines": len(lines), "written": written, "out": out}

    joined = " ".join(line.text for line in lines)
    counts = count_words(tokenize_text(joined, lang_code, raw=raw), lang_code, remove_stopwords=not no_stopwords)
    if top:
        items = counts.most_common(top)
    else:
        items = counts.most_common()

    if out:
        os.makedirs(os.path.dirname(os.path.abspath(out)), exist_ok=True)
        with open(out, "w", encoding="utf-8") as handle:
            handle.writelines(f"{word} {count}\n" for word, count in items)
    return {"mode": mode, "items": items, "out": out}
|
||||
|
||||
Reference in New Issue
Block a user