Rename project to Saiki and unify CLI

This commit is contained in:
Pawel
2026-05-26 18:09:26 -04:00
parent 8ee1f8de25
commit f38030238c
19 changed files with 1274 additions and 1326 deletions

183
saiki/words.py Normal file
View File

@@ -0,0 +1,183 @@
"""Extract and compare language-learning vocabulary."""
from __future__ import annotations
import logging
import os
from collections import Counter
from typing import Callable
import regex as re
from .ankiconnect import anki_request
from .config import Config
from .text import extract_first_visible_line, extract_visible_text, normalize_word_key
# Matches a run consisting only of hiragana, katakana, Han characters, or the
# long-vowel mark "ー"; japanese_filter requires the whole surface form to match.
JAPANESE_CHAR_RE = re.compile(r"[\p{Script=Hiragana}\p{Script=Katakana}\p{Script=Han}ー]+")
# Surface forms that japanese_filter rejects outright.
# NOTE(review): the repeated "" entries look like single-character entries that
# were lost in extraction/encoding — confirm against the upstream file.
JAPANESE_PARTICLES = {
"", "", "", "", "", "", "", "", "", "から", "まで", "より", "", "なら",
"", "", "", "", "", "", "", "", "", "って", "とき", "ってば", "けど", "けれど",
"しかし", "でも", "ながら", "ほど", "", "もの", "こと", "ところ", "よう", "らしい", "られる",
}
# Lemmas that japanese_filter rejects regardless of the surface form.
# NOTE(review): the "" entries here also look like lost characters — confirm.
JAPANESE_GRAMMAR_EXCLUDE = {
"", "", "ます", "れる", "てる", "", "", "しまう", "いる", "ない", "なる", "ある", "", "です",
}
# Values of token.pos_ that japanese_filter accepts; any other value is dropped.
JAPANESE_ALLOWED_POS = {"NOUN", "PROPN", "VERB", "ADJ"}
def setup_logging(logfile: str) -> None:
    """Configure root logging to write INFO-and-above records to *logfile*.

    The directory that will contain the log file is created first if it does
    not exist yet.

    Args:
        logfile: Path of the log file to write to.
    """
    log_dir = os.path.dirname(os.path.abspath(logfile))
    os.makedirs(log_dir, exist_ok=True)
    logging.basicConfig(
        filename=logfile,
        level=logging.INFO,
        format="%(asctime)s [%(levelname)s] %(message)s",
    )
def build_query_from_decks(decks: list[str]) -> str:
    """Build a search query that matches notes in any of the given decks.

    Args:
        decks: Deck names; each becomes a quoted ``deck:"..."`` clause.

    Returns:
        The clauses joined with ``" OR "``; an empty string for no decks.
    """
    clauses = [f'deck:"{deck_name}"' for deck_name in decks]
    return " OR ".join(clauses)
def japanese_filter(token) -> bool:
    """Decide whether a token should be counted as Japanese vocabulary.

    Args:
        token: Object exposing ``text`` and ``lemma_``; ``pos_``, ``is_stop``,
            ``like_url`` and ``like_email`` are read when present.

    Returns:
        True only if the token passes every check below.
    """
    surface = (token.text or "").strip()
    base_form = (token.lemma_ or "").strip()

    # The whole surface form must be kana/kanji (see JAPANESE_CHAR_RE).
    if not (surface and JAPANESE_CHAR_RE.fullmatch(surface)):
        return False
    # Drop grammatical lemmas and listed particle surface forms.
    if base_form in JAPANESE_GRAMMAR_EXCLUDE:
        return False
    if surface in JAPANESE_PARTICLES:
        return False
    # Keep only the allowed parts of speech; a missing pos_ is rejected.
    if getattr(token, "pos_", None) not in JAPANESE_ALLOWED_POS:
        return False
    noise_flags = ("is_stop", "like_url", "like_email")
    if any(getattr(token, flag, False) for flag in noise_flags):
        return False
    # Defensive markup guards, kept even though the fullmatch above is stricter.
    if any(ch in surface for ch in "<>=/\\:&%"):
        return False
    return surface not in {"ruby", "rt", "div", "br", "nbsp", "href", "strong", "a"}
def spanish_filter(token) -> bool:
    """Return True for alphabetic tokens that are not stop words.

    Missing ``is_alpha`` / ``is_stop`` attributes are treated as False.
    """
    if not getattr(token, "is_alpha", False):
        return False
    return not getattr(token, "is_stop", False)
def spanish_format(token) -> str:
    """Return the token's lemma (or its text as a fallback), lower-cased and stripped."""
    raw = token.lemma_
    if not raw:
        raw = token.text
    if not raw:
        raw = ""
    return raw.lower().strip()
def japanese_format(token) -> str:
    """Format a token as ``lemma (surface)``, or just one of them.

    The combined form is used only when both the lemma and the surface form
    are non-empty and differ; otherwise the lemma is returned, falling back to
    the surface form when there is no lemma.
    """
    base_form = (token.lemma_ or "").strip()
    seen_form = (token.text or "").strip()
    if not base_form:
        return seen_form
    if not seen_form or seen_form == base_form:
        return base_form
    return f"{base_form} ({seen_form})"
# Per-language processing hooks, keyed by language name. extract_words looks up
# the entry for config.language_name(lang) and passes "token_filter" (which
# tokens to count) and "output_format" (how to render the counted key) on to
# extract_counts.
LANGUAGE_PROFILES = {
"spanish": {"token_filter": spanish_filter, "output_format": spanish_format},
"japanese": {"token_filter": japanese_filter, "output_format": japanese_format},
}
def load_spacy_model(model_name: str):
    """Import spaCy lazily and load the named pipeline.

    Args:
        model_name: Name passed to ``spacy.load``.

    Returns:
        Whatever ``spacy.load(model_name)`` returns.

    Raises:
        RuntimeError: If spaCy cannot be imported or the model cannot be
            loaded; the underlying exception is chained as the cause.
    """
    # Imported here so the module can be imported without spaCy installed.
    try:
        import spacy  # type: ignore
    except Exception as import_err:
        raise RuntimeError("Failed to import spaCy. Use a Python version supported by spaCy.") from import_err

    try:
        pipeline = spacy.load(model_name)
    except Exception as load_err:
        message = f"Failed to load spaCy model '{model_name}'. Try: python -m spacy download {model_name}"
        raise RuntimeError(message) from load_err
    return pipeline
def get_notes(query: str, config: Config, request: Callable = anki_request) -> list[dict]:
    """Fetch note details for every note matching *query*.

    Args:
        query: Search query sent with the ``findNotes`` action.
        config: Supplies ``anki_connect_url`` for both requests.
        request: Callable used to perform the requests (injectable for tests).

    Returns:
        The ``notesInfo`` result, or an empty list when nothing matches or a
        request returns a falsy value.
    """
    endpoint = config.anki_connect_url
    matching_ids = request("findNotes", url=endpoint, query=query)
    if not matching_ids:
        # Skip the second round-trip when there is nothing to look up.
        return []
    details = request("notesInfo", url=endpoint, notes=matching_ids)
    return details or []
def extract_counts(
    notes: list[dict],
    field_name: str,
    nlp,
    token_filter: Callable,
    output_format: Callable,
    use_full_field: bool,
) -> Counter:
    """Count formatted tokens found in one field across *notes*.

    Args:
        notes: Note dicts; each is read as ``note["fields"][field_name]["value"]``,
            with missing or falsy levels treated as empty.
        field_name: Name of the field to read from every note.
        nlp: Callable that turns text into an iterable of tokens.
        token_filter: Predicate selecting which tokens are counted.
        output_format: Maps an accepted token to its counter key; falsy keys
            are skipped.
        use_full_field: If true, use ``extract_visible_text`` on the field;
            otherwise use ``extract_first_visible_line``.

    Returns:
        A Counter mapping each key to its number of occurrences.
    """
    to_visible = extract_visible_text if use_full_field else extract_first_visible_line
    tally: Counter = Counter()
    for note in notes:
        field_map = note.get("fields", {}) or {}
        field_entry = field_map.get(field_name, {}) or {}
        visible = to_visible(field_entry.get("value", "") or "")
        if not visible:
            continue
        keys = (output_format(tok) for tok in nlp(visible) if token_filter(tok))
        tally.update(key for key in keys if key)
    return tally
def write_counts(counter: Counter, out_path: str, min_freq: int) -> int:
    """Write ``word freq`` lines for entries seen at least *min_freq* times.

    Lines are ordered by descending frequency, then by word. The output
    directory is created if needed and the file is overwritten.

    Args:
        counter: Mapping of word to occurrence count.
        out_path: Destination file path.
        min_freq: Minimum count an entry needs to be written.

    Returns:
        The number of lines written.
    """
    kept = sorted(
        ((word, freq) for word, freq in counter.items() if freq >= min_freq),
        key=lambda pair: (-pair[1], pair[0]),
    )
    target_dir = os.path.dirname(os.path.abspath(out_path))
    os.makedirs(target_dir, exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as out_file:
        out_file.writelines(f"{word} {freq}\n" for word, freq in kept)
    return len(kept)
def read_word_file(path: str) -> set[str]:
    """Read a word file and return the set of normalized word keys.

    Blank lines are ignored. For every other line, the text after the last
    space (the frequency column in files produced by ``write_counts``) is
    dropped before the remainder is normalized.

    Args:
        path: File to read; ``~`` is expanded.

    Returns:
        The normalized keys of all words in the file.
    """
    with open(os.path.expanduser(path), "r", encoding="utf-8") as handle:
        entries = (raw.strip() for raw in handle)
        return {normalize_word_key(entry.rsplit(" ", 1)[0]) for entry in entries if entry}
def compare_word_files(source_path: str, known_path: str) -> list[str]:
    """List the lines of *source_path* whose word is absent from *known_path*.

    Words are compared by their normalized key, using the same
    "drop everything after the last space" parsing as ``read_word_file``.

    Args:
        source_path: Word file to check; ``~`` is expanded.
        known_path: Word file holding the already-known words.

    Returns:
        The stripped source lines (word and trailing column intact), in file
        order, whose word is not known.
    """
    already_known = read_word_file(known_path)
    with open(os.path.expanduser(source_path), "r", encoding="utf-8") as handle:
        return [
            entry
            for entry in map(str.strip, handle)
            if entry and normalize_word_key(entry.rsplit(" ", 1)[0]) not in already_known
        ]
def extract_words(
    config: Config,
    lang: str,
    query: str | None = None,
    decks: list[str] | None = None,
    field: str | None = None,
    min_freq: int = 2,
    outdir: str | None = None,
    out: str | None = None,
    full_field: bool = False,
    spacy_model: str | None = None,
    request: Callable = anki_request,
) -> dict[str, object]:
    """Extract vocabulary counts from Anki notes and write them to a word file.

    Args:
        config: Project configuration; supplies the language name, default
            decks, default field, output root and spaCy model for *lang*.
        lang: Language identifier as understood by *config*.
        query: Explicit search query; when omitted, one is built from *decks*
            (or the configured decks for *lang*).
        decks: Deck names used to build the query when *query* is not given.
        field: Note field to read; defaults to ``config.field_for(lang)``.
        min_freq: Minimum count a word needs to be written out.
        outdir: Output directory override (``~`` expanded).
        out: Output file override (``~`` expanded); takes precedence over
            *outdir*.
        full_field: Analyse the whole visible field text rather than only its
            first visible line.
        spacy_model: spaCy model name override; defaults to the language's
            ``word_model`` setting.
        request: Callable used for the Anki requests (injectable for tests).

    Returns:
        A summary dict with keys ``query``, ``notes``, ``unique``, ``written``
        and ``out``.

    Raises:
        KeyError: If the resolved language has no entry in LANGUAGE_PROFILES.
        RuntimeError: If no spaCy model is configured, the model cannot be
            loaded, or the field is missing from the first returned note.
    """
    language_bucket = config.language_name(lang)
    profile = LANGUAGE_PROFILES[language_bucket]
    search_query = query or build_query_from_decks(decks or config.decks_for(lang))

    out_dir = os.path.expanduser(outdir) if outdir else os.path.join(config.word_output_root, language_bucket)
    out_path = os.path.expanduser(out) if out else os.path.join(out_dir, f"words_{lang}.txt")

    # A missing "word_model" used to be stringified to the literal "None" and
    # handed to spaCy, producing a misleading "download None" hint.
    configured_model = spacy_model or config.language(lang).get("word_model")
    if not configured_model:
        raise RuntimeError(
            f"No spaCy model configured for language '{lang}'; "
            "set 'word_model' in its language config or pass spacy_model."
        )
    nlp = load_spacy_model(str(configured_model))

    notes = get_notes(search_query, config, request=request)
    field_name = field or config.field_for(lang)
    if notes:
        # Only the first note is inspected; extract_counts tolerates notes
        # that lack the field.
        first_fields = notes[0].get("fields", {}) or {}
        if field_name not in first_fields:
            raise RuntimeError(f"Field '{field_name}' not found. Available fields: {list(first_fields.keys())}")

    counter = extract_counts(notes, field_name, nlp, profile["token_filter"], profile["output_format"], full_field)
    written = write_counts(counter, out_path, min_freq)
    return {
        "query": search_query,
        "notes": len(notes),
        "unique": len(counter),
        "written": written,
        "out": out_path,
    }