From 14b9e814da6434eab79bd5268346ed8b303b71da Mon Sep 17 00:00:00 2001
From: Pawel
Date: Thu, 22 Jan 2026 21:04:23 -0500
Subject: [PATCH] initial

---
 README.md                      | 293 +++++++++++++++++++++++
 audio_extractor.py             | 254 ++++++++++++++++++++
 batch_anki_import.sh           | 141 +++++++++++
 figures/anki_basic_card_jp.png | Bin 0 -> 30872 bytes
 word_scraper.py                | 422 +++++++++++++++++++++++++++++++++
 yt-transcript.py               | 199 ++++++++++++++++
 6 files changed, 1309 insertions(+)
 create mode 100644 README.md
 create mode 100755 audio_extractor.py
 create mode 100755 batch_anki_import.sh
 create mode 100644 figures/anki_basic_card_jp.png
 create mode 100755 word_scraper.py
 create mode 100755 yt-transcript.py

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..4720471
--- /dev/null
+++ b/README.md
@@ -0,0 +1,293 @@
+# Anki tools for language learning
+
+A modular collection of tools and scripts to enhance your Anki-based language learning. These tools focus on listening, sentence mining, sentence decks, and more. Built for language learners and immersion enthusiasts.
+
+### Tools Overview
+
+| Tool               | Purpose                                                                   |
+|--------------------|---------------------------------------------------------------------------|
+| `audio-extractor`  | Extract Anki card audio by language into playlists for passive listening  |
+| `batch_importer`   | Generate TTS audio from sentence lists and import into Anki               |
+| `word-scraper`     | Extract & lemmatize words from Anki decks (frequency analysis, mining)    |
+| `yt-transcript`    | Mine vocabulary/sentences from YouTube transcripts for analysis           |
+| `deck-converter`*  | Convert TSV+audio into `.apkg` Anki decks using config-driven workflow    |
+| `youtube-to-anki`* | Convert YouTube subtitles/audio into fully timestamped Anki cards         |
+
+*=coming soon
+
+### Requirements
+
+Each tool has its own set of dependencies. Common dependencies include:
+- Python 3
+- [Anki](https://apps.ankiweb.net/) with [AnkiConnect](https://github.com/amikey/anki-connect)
+- `yt-dlp`, `jq`, `yq`, `spaCy`, `gTTS`, `youtube-transcript-api`, `pyyaml`, `genanki`, `fugashi`, `regex`, `requests`
+- `ffmpeg`
+
+Personally, I like to have one venv that contains all the prerequisites.
+
+```shell
+python3.12 -m venv ~/.venv/anki-tools
+source ~/.venv/anki-tools/bin/activate
+python3 -m pip install -U pip
+pip install gtts jq yq spacy youtube-transcript-api pyyaml genanki fugashi regex requests
+
+# Also install ffmpeg
+sudo dnf install ffmpeg
+```
+That way, whenever you want to run these scripts, you can just source the venv and run the appropriate script.
+
+```shell
+source ~/.venv/anki-tools/bin/activate
+```
+
+### Getting started
+
+Clone the repository:
+```shell
+git clone https://git.pawelsarkowicz.xyz/ps/anki-tools.git
+cd anki-tools
+```
+Then explore.
+
+Most scripts assume:
+- Anki is running
+- the AnkiConnect add-on is enabled (default: http://localhost:8765); you can verify this with the snippet below
+- your Anki cards use the Basic note type, with audio on the front and the sentence (in the target language) on the back. These tools only look at the first line of the back, so you can have notes/translations/etc. on the following lines if you like.
+![anki_basic_card_jp](./figures/anki_basic_card_jp.png)
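+
+To check that AnkiConnect is reachable before running anything, you can ping it from Python. This is a minimal sketch, assuming the default port and no API key configured:
+
+```python
+import requests
+
+# "version" is AnkiConnect's no-op handshake action; any JSON reply means the add-on is up.
+resp = requests.post("http://localhost:8765", json={"action": "version", "version": 6}, timeout=5)
+print(resp.json())  # -> {'result': 6, 'error': None}
+```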
+
+### Language support
+- 🇯🇵 Japanese
+- 🇪🇸 Spanish
+- 🇬🇧 English
+
+
+## audio-extractor
+**Purpose**: Extract audio referenced by `[sound:...]` tags from Anki decks, grouped by language.
+
+### Usage
+
+```bash
+./audio_extractor.py jp [--concat] [--outdir DIR] [--copy-only-new]
+./audio_extractor.py es [--concat] [--outdir DIR] [--copy-only-new]
+```
+
+Outputs:
+- Copies audio into `~/Documents/anki-audio/<language>/` by default
+- Writes `<language>.m3u`
+- With `--concat`, writes `<language>_concat.mp3` (keeps individual files)
+
+### Requirements
+- Anki + AnkiConnect
+- `requests`
+- `ffmpeg` (only if you use `--concat`)
+
+## batch_importer
+**Purpose**: Generate TTS audio from a sentence list and add notes to Anki via AnkiConnect.
+
+### Usage
+
+```bash
+./batch_anki_import.sh {jp,es}
+```
+
+- Generates one MP3 per sentence with `gtts-cli`, speeds it up with `ffmpeg` (pitch preserved), and attaches it to a new Basic note (audio on Front, sentence on Back).
+- Temporary files are cleaned up after each import; the audio ends up in Anki's media collection.
+
+### Requirements
+- Anki + AnkiConnect
+- `gtts-cli`, `ffmpeg`, `curl`
+
+### Sentence files
+- Japanese: `~/Documents/sentences_jp.txt`
+- Spanish: `~/Documents/sentences_es.txt`
+
+## word-scraper
+
+Extract frequent words from Anki notes using **AnkiConnect** and **spaCy**.
+This is primarily intended for language learning workflows (currently Japanese and Spanish).
+
+The script (see the sketch below for a condensed version of this pipeline):
+- queries notes from Anki
+- extracts visible text from a chosen field
+- tokenizes with spaCy
+- filters out stopwords / grammar
+- counts word frequencies
+- writes a sorted word list to a text file
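+
+As a rough illustration, the Spanish path boils down to a few lines of spaCy. This is a condensed sketch of the pipeline above, not the real script; it assumes `es_core_news_sm` is installed and skips the AnkiConnect query:
+
+```python
+from collections import Counter
+import spacy
+
+nlp = spacy.load("es_core_news_sm")
+counts = Counter()
+for line in ["Quiero comer algo", "Vamos a comer juntos"]:  # stand-ins for card text
+    for token in nlp(line):
+        if token.is_alpha and not token.is_stop:
+            counts[token.lemma_.lower()] += 1  # count lemmas, e.g. "comer"
+print(counts.most_common())
+```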
+
+### Requirements
+
+- Anki + AnkiConnect
+- Python **3.12** (recommended; spaCy is not yet stable on 3.14)
+- `spacy`, `regex`, `requests`
+- spaCy models:
+```bash
+python -m spacy download es_core_news_sm
+python -m spacy download ja_core_news_lg
+```
+
+### Usage
+```bash
+./word_scraper.py {jp,es} [options]
+```
+
+| Option                | Description                                                              |
+| --------------------- | ------------------------------------------------------------------------ |
+| `--query QUERY`       | Full Anki search query (e.g. `deck:"Español" tag:foo`)                   |
+| `--deck DECK`         | Deck name (repeatable). If omitted, decks are inferred from the language |
+| `--field FIELD`       | Note field to read (default: `Back`)                                     |
+| `--min-freq N`        | Minimum frequency to include (default: `2`)                               |
+| `--outdir DIR`        | Output directory (default: `~/Documents/anki-words/<language>/`)          |
+| `--out FILE`          | Output file path (default: `<outdir>/words_<lang>.txt`)                   |
+| `--full-field`        | Use full field text instead of only the first visible line                |
+| `--spacy-model MODEL` | Override spaCy model name                                                 |
+| `--logfile FILE`      | Log file path                                                             |
+
+### Examples
+#### Basic usage (auto-detected decks)
+```bash
+./word_scraper.py jp
+./word_scraper.py es
+```
+
+#### Specify a deck explicitly
+```bash
+./word_scraper.py jp --deck "日本語"
+./word_scraper.py es --deck "Español"
+```
+
+#### Use a custom Anki query
+```bash
+./word_scraper.py es --query 'deck:"Español" tag:youtube'
+```
+
+#### Change output location and frequency threshold
+```bash
+./word_scraper.py jp --min-freq 3 --out words_jp.txt
+./word_scraper.py es --outdir ~/tmp/words --out spanish_words.txt
+```
+
+#### Process full field text (not just first line)
+```bash
+./word_scraper.py jp --full-field
+```
+
+### Output format
+The output file contains one entry per line:
+```
+word frequency
+```
+Examples:
+```
+comer 12
+hablar 9
+行く (行き) 8
+見る (見た) 6
+```
+
+- Spanish output uses lemmas
+- Japanese output includes lemma (surface) when they differ
+
+### Language-specific notes
+#### Japanese
+- Filters out particles and common grammar
+- Keeps nouns, verbs, adjectives, and proper nouns
+- Requires `regex` for Unicode script matching
+
+#### Spanish
+- Filters stopwords
+- Keeps alphabetic tokens only
+- Lemmatized output
+
+## yt-transcript
+Extract vocabulary or sentence-level text from YouTube video subtitles (transcripts), for language learning or analysis.
+
+The script (see the fetch sketch below):
+- fetches captions via `youtube-transcript-api`
+- supports **Spanish (es)** and **Japanese (jp)**
+- tokenizes Japanese using **MeCab (via fugashi)**
+- outputs either:
+  - word frequency lists, or
+  - timestamped transcript lines
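+
+Fetching a transcript is nearly a one-liner with `youtube-transcript-api`. A minimal sketch using the older 0.x interface (the script itself also handles the newer 1.x `fetch()` API); `VIDEO_ID` is a placeholder:
+
+```python
+from youtube_transcript_api import YouTubeTranscriptApi
+
+# Each entry carries the caption text plus its start time in seconds.
+for entry in YouTubeTranscriptApi.get_transcript("VIDEO_ID", languages=["es"]):
+    print(f"[{entry['start']:.2f}s] {entry['text']}")
+```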
+
+### Features
+
+- Extract full vocabulary lists with frequency counts
+- Extract sentences (with timestamps or sentence indices)
+- Support for Japanese tokenization
+- Optional: stopword filtering
+- Modular and extendable for future features like CSV export or audio slicing
+
+### Requirements
+- `youtube-transcript-api`
+- For Japanese tokenization:
+```
+pip install "fugashi[unidic-lite]"
+```
+
+### Usage
+```shell
+./yt-transcript.py {jp,es} VIDEO [options]
+```
+
+### Options
+| Option                     | Description                            |
+| -------------------------- | -------------------------------------- |
+| `--mode {vocab,sentences}` | Output mode (default: `vocab`)         |
+| `--top N`                  | Show only the top N words (vocab mode) |
+| `--no-stopwords`           | Keep common words                      |
+| `--raw`                    | (Spanish only) Do not lowercase tokens |
+
+
+### Examples
+#### Extract Spanish vocabulary
+```bash
+./yt-transcript.py es https://youtu.be/VIDEO_ID
+```
+
+#### Top 50 words
+```bash
+./yt-transcript.py es VIDEO_ID --top 50
+```
+
+#### Japanese transcript with timestamps
+```bash
+./yt-transcript.py jp VIDEO_ID --mode sentences
+```
+
+#### Keep Spanish casing and stopwords
+```bash
+./yt-transcript.py es VIDEO_ID --raw --no-stopwords
+```
+
+### Output formats
+#### Vocabulary mode
+```
+palabra: count
+```
+Example:
+```
+comer: 12
+hablar: 9
+```
+
+#### Sentence mode
+```
+[12.34s] sentence text here
+```
+Example:
+```
+[45.67s] 今日はいい天気ですね
+```
+
+### Language Notes
+#### Spanish
+- Simple regex-based tokenizer
+- Accented characters supported
+- Lowercased by default
+
+#### Japanese
+- Uses fugashi (MeCab)
+- Outputs surface forms
+- Filters via stopword list only (no POS filtering)
+
+# License
diff --git a/audio_extractor.py b/audio_extractor.py
new file mode 100755
index 0000000..cca3e87
--- /dev/null
+++ b/audio_extractor.py
@@ -0,0 +1,254 @@
+#!/usr/bin/env python3
+"""
+audio_extractor.py
+
+Extract all Anki media referenced by [sound:...] tags from one or more decks (grouped by language),
+copy them into a language-specific output folder, write an .m3u playlist, and optionally concatenate
+all audio into a single MP3 file.
+
+Howto:
+    ./audio_extractor.py jp [--concat] [--outdir DIR] [--copy-only-new]
+    ./audio_extractor.py es [--concat] [--outdir DIR] [--copy-only-new]
+
+Requirements:
+    - Anki running + AnkiConnect enabled at http://localhost:8765
+    - Python package: requests
+    - OPTIONAL (for --concat): ffmpeg
+
+Notes:
+    - This scans all fields of each note and extracts filenames inside [sound:...]
+    - It copies referenced media files out of Anki's collection.media folder
+    - It preserves filenames (and subfolders if they exist)
+"""
+
+import os
+import re
+import sys
+import argparse
+import shutil
+import subprocess
+import tempfile
+from typing import Dict, List
+
+import requests
+
+
+# Map deck name -> language bucket
+deck_to_language: Dict[str, str] = {
+    "日本語": "japanese",
+    "Español": "spanish",
+    # Add more mappings here
+}
+
+# Map CLI lang code -> language bucket
+lang_map: Dict[str, str] = {
+    "jp": "japanese",
+    "es": "spanish",
+}
+
+# If Anki is installed as a flatpak, media dir is typically:
+media_dir = os.path.expanduser("~/.var/app/net.ankiweb.Anki/data/Anki2/User 1/collection.media")
+
+# Default export root (can be overridden by --outdir)
+output_root = os.path.expanduser("~/Documents/anki-audio")
+
+AUDIO_EXTS = (".mp3", ".wav", ".ogg", ".m4a", ".flac")
+
+
+def anki_request(action: str, **params):
+    """Make an AnkiConnect request and return 'result'. Raise on error."""
+    resp = requests.post(
+        "http://localhost:8765",
+        json={"action": action, "version": 6, "params": params},
+        timeout=30,
+    )
+    resp.raise_for_status()
+    data = resp.json()
+    if data.get("error") is not None:
+        raise RuntimeError(f"AnkiConnect error for {action}: {data['error']}")
+    return data["result"]
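+
+
+# For reference, the JSON exchanged with AnkiConnect looks like this (illustrative note IDs):
+#   request : {"action": "findNotes", "version": 6, "params": {"query": "deck:\"日本語\""}}
+#   response: {"result": [1483959289817, 1483959291695], "error": null}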
+
+
+def ensure_ffmpeg_available() -> None:
+    """Raise a helpful error if ffmpeg isn't installed."""
+    if shutil.which("ffmpeg") is None:
+        raise RuntimeError("ffmpeg not found in PATH. Install ffmpeg to use --concat.")
+
+
+def build_playlist(out_dir: str, language: str) -> str:
+    """
+    Create an .m3u playlist listing audio files in out_dir (sorted by filename).
+    Returns the playlist path.
+    """
+    m3u_path = os.path.join(out_dir, f"{language}.m3u")
+    files = sorted(
+        f for f in os.listdir(out_dir)
+        if f.lower().endswith(AUDIO_EXTS) and os.path.isfile(os.path.join(out_dir, f))
+    )
+
+    with open(m3u_path, "w", encoding="utf-8") as fh:
+        for fname in files:
+            fh.write(f"{fname}\n")
+
+    return m3u_path
+
+
+def concat_audio_from_m3u(out_dir: str, m3u_path: str, out_path: str) -> None:
+    """
+    Concatenate audio files in the order listed in the .m3u.
+    Uses the ffmpeg concat demuxer and re-encodes to MP3 for reliability.
+
+    Keeps original files untouched.
+    """
+    ensure_ffmpeg_available()
+
+    # Read playlist entries (filenames, one per line)
+    with open(m3u_path, "r", encoding="utf-8") as fh:
+        rel_files = [line.strip() for line in fh if line.strip()]
+
+    # Filter to existing audio files
+    abs_files: List[str] = []
+    for rel in rel_files:
+        p = os.path.join(out_dir, rel)
+        if os.path.isfile(p) and rel.lower().endswith(AUDIO_EXTS):
+            abs_files.append(os.path.abspath(p))
+
+    if not abs_files:
+        raise RuntimeError("No audio files found to concatenate (playlist is empty?).")
+
+    # ffmpeg's concat demuxer expects a file with lines like: file '/abs/path/to/file'
+    # Use a temp file so we don't leave junk behind if ffmpeg fails.
+    with tempfile.NamedTemporaryFile("w", delete=False, encoding="utf-8") as tmp:
+        concat_list_path = tmp.name
+        for p in abs_files:
+            # Escape single quotes for the ffmpeg concat list
+            safe = p.replace("'", "'\\''")
+            tmp.write(f"file '{safe}'\n")
+
+    # Re-encode to MP3 to avoid header/codec mismatches across files
+    cmd = [
+        "ffmpeg",
+        "-hide_banner",
+        "-loglevel", "error",
+        "-f", "concat",
+        "-safe", "0",
+        "-i", concat_list_path,
+        "-c:a", "libmp3lame",
+        "-q:a", "4",
+        "-y",
+        out_path,
+    ]
+
+    try:
+        subprocess.run(cmd, check=True)
+    finally:
+        try:
+            os.remove(concat_list_path)
+        except OSError:
+            pass
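+
+
+# The command built above is equivalent to running by hand:
+#   ffmpeg -f concat -safe 0 -i list.txt -c:a libmp3lame -q:a 4 -y out.mp3
+# where list.txt contains one "file '/abs/path'" line per input, in playlist order.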
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(
+        description="Extract Anki audio by language."
+    )
+
+    # REQUIRED positional language code: jp / es
+    parser.add_argument(
+        "lang",
+        choices=sorted(lang_map.keys()),
+        help="Language code (jp or es).",
+    )
+
+    # Match bash-style flags
+    parser.add_argument(
+        "--concat",
+        action="store_true",
+        help="Also output a single concatenated MP3 file (in playlist order).",
+    )
+    parser.add_argument(
+        "--outdir",
+        help="Output directory. Default: ~/Documents/anki-audio/<language>/",
+    )
+
+    # Keep your existing useful behavior
+    parser.add_argument(
+        "--copy-only-new",
+        action="store_true",
+        help="Skip overwriting existing files.",
+    )
+
+    args = parser.parse_args()
+
+    language = lang_map[args.lang]
+
+    # Find all decks whose mapped language matches
+    selected_decks = [deck for deck, lang in deck_to_language.items() if lang == language]
+    if not selected_decks:
+        print(f"No decks found for language: {language}", file=sys.stderr)
+        return 1
+
+    # Output folder: either user-specified --outdir or default output_root/<language>
+    out_dir = os.path.expanduser(args.outdir) if args.outdir else os.path.join(output_root, language)
+    os.makedirs(out_dir, exist_ok=True)
+
+    # Collect note IDs across selected decks
+    all_ids: List[int] = []
+    for deck in selected_decks:
+        ids = anki_request("findNotes", query=f'deck:"{deck}"')
+        all_ids.extend(ids)
+
+    if not all_ids:
+        print(f"No notes found in decks for language: {language}")
+        return 0
+
+    # Fetch notes info (fields contain [sound:...] references)
+    notes = anki_request("notesInfo", notes=all_ids)
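+
+    # Fields embed audio as [sound:filename] tags, e.g. a Back field holding
+    #   "こんにちは[sound:rec_1700000000.mp3]"  (illustrative filename)
+    # from which the regex below captures "rec_1700000000.mp3".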
+
+    # Copy referenced audio files into out_dir
+    copied: List[str] = []
+    for note in notes:
+        fields = note.get("fields", {})
+        for field in fields.values():
+            val = field.get("value", "") or ""
+            for match in re.findall(r"\[sound:(.+?)\]", val):
+                src = os.path.join(media_dir, match)
+                dst = os.path.join(out_dir, match)
+
+                if not os.path.exists(src):
+                    continue
+
+                # If Anki stored media in subfolders, ensure the subfolder exists in out_dir
+                dst_parent = os.path.dirname(dst)
+                if dst_parent:
+                    os.makedirs(dst_parent, exist_ok=True)
+
+                if args.copy_only_new and os.path.exists(dst):
+                    continue
+
+                shutil.copy2(src, dst)
+                copied.append(match)
+
+    # Create playlist (top-level audio only; if you have subfolders, you can extend this)
+    m3u_path = build_playlist(out_dir, language)
+
+    print(f"\n✅ Copied {len(copied)} files for {language}")
+    print(f"🎵 Playlist created at: {m3u_path}")
+    print(f"📁 Output directory: {out_dir}")
+
+    # Optional: concatenate all audio into one MP3 (order = playlist order)
+    if args.concat:
+        concat_out = os.path.join(out_dir, f"{language}_concat.mp3")
+        try:
+            concat_audio_from_m3u(out_dir, m3u_path, concat_out)
+            print(f"🎧 Concatenated file created at: {concat_out}")
+        except Exception as e:
+            print(f"❌ Concatenation failed: {e}", file=sys.stderr)
+            return 1
+
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/batch_anki_import.sh b/batch_anki_import.sh
new file mode 100755
index 0000000..ebf52d6
--- /dev/null
+++ b/batch_anki_import.sh
@@ -0,0 +1,141 @@
+#!/bin/bash
+
+prog="$(basename "$0")"
+
+print_help() {
+    cat <<EOF
+usage: $prog [-h] {es,jp}
+
+Generate TTS audio for each line of the language's sentence file and import
+the sentences into Anki as Basic notes via AnkiConnect.
+EOF
+}
+
+arg_error_missing_lang() {
+    echo "usage: $prog [-h] {es,jp}" >&2
+    echo "$prog: error: the following arguments are required: lang" >&2
+    exit 2
+}
+
+arg_error_unknown() {
+    echo "usage: $prog [-h] {es,jp}" >&2
+    echo "$prog: error: unrecognized arguments: $*" >&2
+    exit 2
+}
+
+lang=""
+
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        -h|--help)
+            print_help
+            exit 0
+            ;;
+        jp|es)
+            if [[ -n "$lang" ]]; then
+                arg_error_unknown "$1"
+            fi
+            lang="$1"
+            shift
+            ;;
+        *)
+            arg_error_unknown "$1"
+            ;;
+    esac
+done
+
+[[ -z "$lang" ]] && arg_error_missing_lang
+
+case "$lang" in
+    jp)
+        DECK_NAME="日本語"
+        LANG_CODE="ja"
+        TLD="com"
+        TEMPO="1.35"
+        SENTENCE_FILE="$HOME/Documents/sentences_jp.txt"
+        ;;
+    es)
+        DECK_NAME="Español"
+        LANG_CODE="es"
+        TLD="es"
+        TEMPO="1.25"
+        SENTENCE_FILE="$HOME/Documents/sentences_es.txt"
+        ;;
+esac
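+
+# Note on TEMPO: it feeds ffmpeg's atempo filter below, which changes playback
+# speed without shifting pitch. A single atempo stage only accepts a limited
+# factor range (0.5-2.0 on older ffmpeg builds), so 1.25/1.35 sit safely inside it.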
+
+TAGS='["AI-generated", "text-to-speech"]'
+count=0
+
+# Use a temporary directory to handle processing
+TEMP_DIR=$(mktemp -d)
+
+while IFS= read -r sentence || [[ -n "$sentence" ]]; do
+    [[ -z "$sentence" ]] && continue
+
+    # Generate unique filenames
+    BASENAME="tts_$(date +%Y%m%d_%H%M%S)_${lang}_$RANDOM"
+    # Path for the raw output from gtts
+    RAW_OUTPUT="$TEMP_DIR/${BASENAME}_original.mp3"
+    # Path for the sped-up output that goes to Anki
+    OUTPUT_PATH="$TEMP_DIR/${BASENAME}.mp3"
+
+    echo "🔊 Processing: $sentence"
+
+    # 1. Generate TTS with specific TLD
+    if gtts-cli "$sentence" --lang "$LANG_CODE" --tld "$TLD" --output "$RAW_OUTPUT"; then
+
+        # 2. Speed up audio using ffmpeg without changing pitch
+        if ffmpeg -loglevel error -i "$RAW_OUTPUT" -filter:a "atempo=$TEMPO" -y "$OUTPUT_PATH" < /dev/null; then
+
+            # 3. Add to Anki using the sped-up file
+            result=$(curl -s localhost:8765 -X POST -d "{
+                \"action\": \"addNote\",
+                \"version\": 6,
+                \"params\": {
+                    \"note\": {
+                        \"deckName\": \"$DECK_NAME\",
+                        \"modelName\": \"Basic\",
+                        \"fields\": {
+                            \"Front\": \"\",
+                            \"Back\": \"$sentence\"
+                        },
+                        \"options\": {
+                            \"allowDuplicate\": false
+                        },
+                        \"tags\": $TAGS,
+                        \"audio\": [{
+                            \"path\": \"$OUTPUT_PATH\",
+                            \"filename\": \"${BASENAME}.mp3\",
+                            \"fields\": [\"Front\"]
+                        }]
+                    }
+                }
+            }")
+
+            if [[ "$result" == *'"error": null'* ]]; then
+                echo "✅ Added card: $sentence"
+                ((count++))
+            else
+                echo "❌ Failed to add card: $sentence"
+                echo "$result"
+            fi
+        else
+            echo "❌ Failed to speed up audio for: $sentence"
+        fi
+
+        # 4. Cleanup
+        rm -f "$OUTPUT_PATH" "$RAW_OUTPUT"
+    else
+        echo "❌ Failed to generate TTS for: $sentence"
+    fi
+
+done <"$SENTENCE_FILE"
+
+# Cleanup temp directory
+rm -rf "$TEMP_DIR"
+
+echo "🎉 Done! Added $count cards to deck \"$DECK_NAME\"."
diff --git a/figures/anki_basic_card_jp.png b/figures/anki_basic_card_jp.png
new file mode 100644
index 0000000000000000000000000000000000000000..98c3a0f873452f0060cdc9a402bdcae2f4fe7c3b
GIT binary patch
literal 30872
[base85 PNG payload not reproduced here: figures/anki_basic_card_jp.png, the Basic-card screenshot referenced by the README]
zB_bOZuh?8Av_@(*c-ua^Jeb_O(XN(=(~-Ma(%KYZ%At2^O~H5dh*}iso=jNuOmO2| zgYI{h77iU7Zp`w-dhgJFj`!3OIhM@7I?IgX+B+-aN|kC|JeCkFBU@)Hufk$&jp>eNy5JK;m;sLPZex}p zDf7Qa0`{7Q2far2r#Bx~J3w?V&rPYk#_zbekG19Ze-gws(a)2wC|SR!W_rB%w;8-Z zhi{cn*2VosZR{``vA?H0BPBxm9_pXFSn)yIfJZH=FC=lmwuuY}Y@YNS@?lhL#orXo zeNbPUE!yF#vXlVrCOk5?)w{Vnb(CgysHfB|kGM&*Cs0YpH6sl0#J)zXrO3LlV35&u zPV?Il(F4e<9H#@^kBLCzkXlP>6*$oLpRu?T-(HFrm|d3$ z7aRXh>E)0;rnNtq4Yxnw7Fh>tW0b?vMbBR5q~m_{7o{l5^muPb9aisWEk{EMQy`Z` z%pG1WK4pPBhukqbTmAwar51cR6SK==lsDRi{G}CZ84Hg`2{DQ^2|JvsZY-Kgacryt zZsADNGq;fF75bP3jA&G;Hs{^T6b!K(KOKoH_;~D1Hi%y3bb^>)T5yx(D zkjjGVR=wDnk@LCwgI;EncbATuVs~U$gIn(#TfPFjLaGf-H(+;W?|>Ko1R|RQRlb=e zVIxmV8w(>yOqEeAfb#a5pF2jPuz$|aP_2!WS<7mG)z{!H3{tBRxu==Q>>tZtntgb2 z#~m4Bt^94J$<+Y7~RdUDfx z#XCaIg7ZNeu4MHCss0yco4kea)Sl2hoQ|Krp;{s)it((v%>Zv$kQ)2Qc@m{yPe6Mv zM!D|RcUG;m=-=_R`*0RPv6%-`@$$*Z%`%!ixQ1K7$CcQe)Vp=Dj&7uYGy?1Z<@6)u ztGzNO`lBrp`i&-RVr=AC_?Aq?9F*7WUq9%@ben5O)rW{B=L&ky=*~N5MP=bfKFV2eT|mrx z#tGU5H~SB<``fUj97~pTwTu=#Zi=H!{4#~~Q=Ze;+@2~V__lZ{ezddxP0;*tfpFrf zG~sZOy;#$9k_+!Hd`tFkd~;;3|LFO`aSC~nRsW}IUg_@{-CrNe|L<@v9I6wc>eD!} zxVV%F6}y%s@jvIpD5aT{WIb8yn0ucq8+CCs>sGHj!pCdGbFW?errTTo&EfGh_F3EF zzZ+{08_C*fJON!R2eKzi8~RM0lZ}i+$!a-gXnndg4jsSW?kn?hG24chC3(oL^t6ky zaMHa_m+WNS3xW`|{ z?@s49x%Td!&g6zd?D)L{SzXF`ByoBRnbd;QhHo=0tLM?mB%4&ctQofvcq=r!q3=j_ z=1Quu(P+-N^JL6${d{Dr9+prNNGlg`z~st3Om$1)8>{yMCvxg(b^MuE5uq&i5)Wix z3Ps*%FV`3ag_euAPiOCzIF-!i#1-OrIE)8*zk}hDZ9V1whSKfUG5m!~Yx~o3Y9+#{ z`AXzkg_eR?TaFX`SS~e&#sXi&VkW)&;!vZEceP$fC}Vox)Xd$X)|?itYjc{=>6kw* zR_uJR8{5!tRQt-`rh=S!C>050_#RI_aYv*`+djqs0wI z8cW3SayBuU)Y6ACg|uRd1NF$>2NTW=t0I>xF>a-YYMS@$3(j*SFm{+tmp#Gwrx40w zvx`cH8fnqOh0brAd_t+5&oRntk)QsA0dhZy(Rd)WF5w)p zeov4}lqj`;DKwbWYUvcC+F8Wf!=LH;Q$&r@wVp&XtRT~8uFuQ$MQyL#ZrTn3`-LO@ zMs20pq)6eMDAjKi?M@gFr~r4wi;GbjWRLQHe;c*jzxz9F8r_*^+V7JROp#dN22*1pEWFnsd6 zCVmEqN4U(AS`F9m6h~s@M`0(bO!#k({+{JyaVKvGIhqDDcd`^J{V2z{YRpp@-i*27 zSDXfWtX`1`>f%55`oTk>+T}eb#i_T)ib#sV`6bx-X9}rLcNCb{NfcI?5R3~b5nyv zBaA!IzbcbTi1|ycB>8@HyDKEIDt<))zk7^6oCJZHQjuvylbFpQ*+)lPNK9nz<0v!z zlFInMEVPf|(%s5(viqA>o$&plZxje}Y7=hwKG!(nf`l`<$CXV1qfpW_+Bx?+n^kgF z?BxR_pdfqBOFSY(6_vJ@Ib9w!#c-Cr)x6Pml3xBeEzT_!ix5x-yd`-5!-wcO#QH84NQ_7#Q#8&1h46$IQhjl`wk^+~y z*68KN&Ryo;G~G+`%jP@Kv^rl%N$q#LD_jh-d$ZE&tX93TAn#E=T=V(mB~4sVc}}hE zO}xxRM!RdEh8M1k$U#5J(C~$H5G4|Q`&{i7$J}J@sdv*`x=z!~raL|7WtNv-`8B-} z*>yCWmo}!!LO$9j)6b~?9X#bs52PJLDW;hSq@uegHX~~7Ol-Q>XMflFS-APL%o6lv zj=??-&a!$PHGz2>*Muy#WblsRfATO;44qK5UI1G$8fjdujdY{hgRTYbeZD|u&rySO z{t^@dGm=SL)+o^D05VHdUYU7c%mcG!u~)~PIs|`klJH%FNuHoaI%n!n)=bk z1B!-{-7;|x@(I@h+nrkWW%|n0`5Tga-N@h;&zz6uRpQU84gJfmXh8F|$*~{9M%LcP z*AZb^Xr@}cmyQyCrV$=@jhi0(dOIB-3)!-OHnZ>UxNVW2|GD~iy;xnyz@>~GhuRDs z-R%!);1ek|DS;_`t$iAJLxP+ZHAC^LpIz1;@=C3cn@gO~>oHG@?^v{wT$TYY`=e-0 z>rPS*LYFM@^g1?ex+9tPp_vaksW9F=UTvy&Z^6=$Bg&WazrzJ`R)%G~Z(jP8`7KlG{4f%1lFq{!_Lh;K z$^)d}ut+kUvvTi}L3bJb zFHK3n>I7GTJ&vuATVUBz;b^JcBOlAaU$*?VvR52|)QmHS9kvY1>3Cs~wtW+Ixt4G{ z!EjsHjE~8>Ewyrj?}{BEN%hs|Kl-6~kASW0&hg^KH2r7Z4w{T4!|nTo^UhK$Il_N; zV2h^`tQgun-(KdibuXhbEF66jWl!5ZST0qnx^Va&XGrIb>}|8DJ5-akn@m3@K-3Qh z)Jx=7tFJ*En~%Gqc74sKF|nI-EiK{5CBi5UX}W$B#oJr$9*lOQYSf$A3OUYSyUDIM zYBA>!k+?5%wrSPr_2~8o^+x+Y?arRqejy>S=^u7{)X!hSw8U1v+a>ZN=9NSJb(Afi zAz%N{i@zhs%NC05t|u4`@b3Ofj&%pKQ0tL-;Bq7D@UJc;5KYSW#!V)4y&=CIOYDdx z^j(s)3wa-J^}RCx5%I#5KC+QQ*`xT&Xdg&x3M{Zbx{SDy4HIFaOqgHgPjvw*P;o`8wp-jHt^@ z1ZU{7ZDPd&e1tQ@ZN|8g;;lj|VpB?=VMf!UtkUw7^_jdL)r#i+MN)eN{hoC?weXVpructL^0qyr#jMrDA~LWC*pYR{3Ke z`b!Sk&iSg41Py^2_eMtYLt+M%wEa<;bluLWguWKT0{Q32SVDyFo9P?2)$jf~avlB* z^iP=bvpOzrHRD+qfYzquT4JBN#W$b50T?vM)wbwutuA^CJ>~!3KR(S{dy7MqPRV^1 TPlR6wrn{x6euH?; language bucket. 
+LANG_MAP: Dict[str, str] = {
+    "jp": "japanese",
+    "es": "spanish",
+}
+
+# Map deck name -> language bucket (same pattern as audio_extractor.py).
+DECK_TO_LANGUAGE: Dict[str, str] = {
+    "日本語": "japanese",
+    "Español": "spanish",
+    # Add more deck mappings here
+}
+
+# Default output root (mirrors the “one folder per language” idea)
+DEFAULT_OUTPUT_ROOT = os.path.expanduser("~/Documents/anki-words")
+
+
+# -------------------------
+# Logging
+# -------------------------
+def setup_logging(logfile: str) -> None:
+    os.makedirs(os.path.dirname(os.path.abspath(logfile)), exist_ok=True)
+    logging.basicConfig(
+        filename=logfile,
+        level=logging.INFO,
+        format="%(asctime)s [%(levelname)s] %(message)s",
+    )
+
+
+# -------------------------
+# HTML cleanup helpers
+# -------------------------
+def extract_first_visible_line(text: str) -> str:
+    """Remove common HTML and return only the first visible line."""
+    text = unescape(text or "")
+    text = re.sub(r"<br[^>]*>", "\n", text, flags=re.IGNORECASE)
+    text = re.sub(r"<[^>]+>", "", text)
+    text = text.strip()
+    return text.splitlines()[0] if text else ""
+
+
+def extract_visible_text(text: str) -> str:
+    """Remove common HTML and return all visible text as a single string."""
+    text = unescape(text or "")
+    text = re.sub(r"<br[^>]*>", "\n", text, flags=re.IGNORECASE)
+    text = re.sub(r"<[^>]+>", "", text)
+    # Normalize whitespace a bit
+    text = re.sub(r"[ \t]+", " ", text)
+    text = re.sub(r"\n{2,}", "\n", text)
+    return text.strip()
+
+
+# -------------------------
+# AnkiConnect helper
+# -------------------------
+def anki_request(action: str, **params):
+    """
+    Make an AnkiConnect request and return 'result'.
+    Raises a helpful error if the HTTP call fails or AnkiConnect returns an error.
+    """
+    resp = requests.post(
+        "http://localhost:8765",
+        json={"action": action, "version": 6, "params": params},
+        timeout=30,
+    )
+    resp.raise_for_status()
+    data = resp.json()
+    if data.get("error") is not None:
+        raise RuntimeError(f"AnkiConnect error for {action}: {data['error']}")
+    return data["result"]
+
+
+def get_notes(query: str) -> List[dict]:
+    """
+    Query Anki for notes and return the notesInfo payload.
+    """
+    note_ids = anki_request("findNotes", query=query) or []
+    if not note_ids:
+        return []
+    return anki_request("notesInfo", notes=note_ids) or []
+
+
+# -------------------------
+# Language-specific token rules (spaCy-based)
+# -------------------------
+JAPANESE_CHAR_RE = re.compile(r"[\p{Script=Hiragana}\p{Script=Katakana}\p{Script=Han}ー]+")
+
+JAPANESE_PARTICLES = {
+    "は", "が", "を", "に", "へ", "で", "と", "や", "も", "から", "まで", "より", "ば", "なら",
+    "の", "ね", "よ", "ぞ", "ぜ", "さ", "わ", "か", "な", "って", "とき", "ってば", "けど", "けれど",
+    "しかし", "でも", "ながら", "ほど", "し", "もの", "こと", "ところ", "よう", "らしい", "られる",
+}
+
+JAPANESE_GRAMMAR_EXCLUDE = {
+    "て", "た", "ます", "れる", "てる", "ぬ", "ん", "しまう", "いる", "ない", "なる", "ある", "だ", "です",
+}
+
+JAPANESE_ALLOWED_POS = {"NOUN", "PROPN", "VERB", "ADJ"}
+
+
+def japanese_filter(token) -> bool:
+    """
+    Filter Japanese tokens to keep “content-ish” words and avoid particles/grammar glue.
+    Assumes a Japanese spaCy model that provides lemma_ and pos_ reasonably.
+    """
+    text = (token.text or "").strip()
+    lemma = (token.lemma_ or "").strip()
+
+    if not text:
+        return False
+
+    # Must look like Japanese script (hiragana/katakana/kanji/ー)
+    if not JAPANESE_CHAR_RE.fullmatch(text):
+        return False
+
+    # Drop obvious grammar / particles
+    if lemma in JAPANESE_GRAMMAR_EXCLUDE or text in JAPANESE_PARTICLES:
+        return False
+
+    # Keep only selected parts of speech
+    if getattr(token, "pos_", None) not in JAPANESE_ALLOWED_POS:
+        return False
+
+    # Drop URLs/emails/stopwords when the model flags them
+    if getattr(token, "is_stop", False) or getattr(token, "like_url", False) or getattr(token, "like_email", False):
+        return False
+
+    # Defensive: drop tokens that look like HTML fragments or garbage
+    if any(c in text for c in "<>=/\\:&%"):
+        return False
+    if text in {"ruby", "rt", "div", "br", "nbsp", "href", "strong", "a"}:
+        return False
+
+    return True
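+
+
+# Illustrative behaviour of the filter above (ja_core_news_lg-style analyses, not guaranteed):
+#   食べ (from 食べた) -> lemma 食べる, POS VERB -> kept
+#   は                 -> in JAPANESE_PARTICLES -> dropped
+#   です               -> in JAPANESE_GRAMMAR_EXCLUDE -> dropped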
+ """ + text = (token.text or "").strip() + lemma = (token.lemma_ or "").strip() + + if not text: + return False + + # Must look like Japanese script (hiragana/katakana/kanji/ー) + if not JAPANESE_CHAR_RE.fullmatch(text): + return False + + # Drop obvious grammar / particles + if lemma in JAPANESE_GRAMMAR_EXCLUDE or text in JAPANESE_PARTICLES: + return False + + # Keep only selected parts of speech + if getattr(token, "pos_", None) not in JAPANESE_ALLOWED_POS: + return False + + # Drop URLs/emails/stopwords when model flags them + if getattr(token, "is_stop", False) or getattr(token, "like_url", False) or getattr(token, "like_email", False): + return False + + # Defensive: drop tokens that look like HTML fragments or garbage + if any(c in text for c in "<>=/\\:&%"): + return False + if text in {"ruby", "rt", "div", "br", "nbsp", "href", "strong", "a"}: + return False + + return True + + +def spanish_filter(token) -> bool: + """ + Keep alpha tokens that are not stopwords. (spaCy handles accent marks fine here.) + """ + return bool(getattr(token, "is_alpha", False)) and not bool(getattr(token, "is_stop", False)) + + +def spanish_format(token) -> str: + return (token.lemma_ or token.text or "").lower().strip() + + +def japanese_format(token) -> str: + # Keep both lemma and surface form (useful when lemma normalization is aggressive) + lemma = (token.lemma_ or "").strip() + surface = (token.text or "").strip() + if not lemma and not surface: + return "" + if lemma and surface and lemma != surface: + return f"{lemma} ({surface})" + return lemma or surface + + +LANGUAGE_PROFILES = { + "spanish": { + "spacy_model": "es_core_news_sm", + "token_filter": spanish_filter, + "output_format": spanish_format, + }, + "japanese": { + "spacy_model": "ja_core_news_lg", + "token_filter": japanese_filter, + "output_format": japanese_format, + }, +} + + +def load_spacy_model(model_name: str): + """ + Import spaCy lazily and load a model. + This lets us show clearer errors when spaCy is missing/broken in the environment. + """ + try: + import spacy # type: ignore + except Exception as e: + raise RuntimeError( + "Failed to import spaCy. If you're on Python 3.14, spaCy may not be compatible yet.\n" + "Use a Python 3.12 venv for this script." + ) from e + + try: + return spacy.load(model_name) + except Exception as e: + raise RuntimeError( + f"Failed to load spaCy model '{model_name}'.\n" + f"Try: python -m spacy download {model_name}" + ) from e + + +# ------------------------- +# Core extraction +# ------------------------- +def extract_counts( + notes: List[dict], + field_name: str, + nlp, + token_filter: Callable, + output_format: Callable, + use_full_field: bool, +) -> Counter: + """ + For each note, take the specified field, strip HTML, tokenize, and count. + """ + counter: Counter = Counter() + + for note in notes: + fields = note.get("fields", {}) or {} + raw_val = (fields.get(field_name, {}) or {}).get("value", "") or "" + + text = extract_visible_text(raw_val) if use_full_field else extract_first_visible_line(raw_val) + if not text: + continue + + doc = nlp(text) + for token in doc: + if token_filter(token): + key = output_format(token) + if key: + counter[key] += 1 + + return counter + + +def write_counts(counter: Counter, out_path: str, min_freq: int) -> int: + """ + Write "token count" lines sorted by descending count. + Returns the number of written entries. 
+ """ + items = [(w, c) for (w, c) in counter.items() if c >= min_freq] + items.sort(key=lambda x: (-x[1], x[0])) + + os.makedirs(os.path.dirname(os.path.abspath(out_path)), exist_ok=True) + with open(out_path, "w", encoding="utf-8") as f: + for word, freq in items: + f.write(f"{word} {freq}\n") + + return len(items) + + +def build_query_from_decks(decks: List[str]) -> str: + """ + Build an Anki query that OR's multiple deck:"..." clauses. + """ + # deck:"日本語" OR deck:"日本語::subdeck" is possible but we keep it simple. + parts = [f'deck:"{d}"' for d in decks] + return " OR ".join(parts) + + +# ------------------------- +# Main CLI +# ------------------------- +def main() -> int: + parser = argparse.ArgumentParser( + description="Extract frequent words from Anki notes (CLI resembles other toolkit scripts)." + ) + + # Match "positional lang” style (jp/es) + parser.add_argument("lang", choices=sorted(LANG_MAP.keys()), help="Language code (jp or es).") + + # Let you override deck selection, but keep sane defaults: + # - if --query is provided, we use that exactly + # - else if --deck is provided (repeatable), we use those decks + # - else we infer decks from DECK_TO_LANGUAGE mapping + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--query", + help='Full Anki search query (e.g. \'deck:"Español" tag:foo\'). Overrides --deck.', + ) + group.add_argument( + "--deck", + action="append", + help='Deck name (repeatable). Example: --deck "日本語" --deck "日本語::Subdeck"', + ) + + # Similar “bashy” knobs + parser.add_argument("--field", default="Back", help="Which note field to read (default: Back).") + parser.add_argument("--min-freq", type=int, default=2, help="Minimum frequency to include (default: 2).") + parser.add_argument("--outdir", help="Output directory (default: ~/Documents/anki-words/).") + parser.add_argument("--out", help="Output file path (default: /words_.txt).") + parser.add_argument( + "--full-field", + action="store_true", + help="Use the full field text (HTML stripped) instead of only the first visible line.", + ) + parser.add_argument( + "--spacy-model", + help="Override the spaCy model name (advanced).", + ) + parser.add_argument( + "--logfile", + default=os.path.expanduser("~/Documents/anki-words/extract_words.log"), + help="Log file path.", + ) + + args = parser.parse_args() + + setup_logging(args.logfile) + + language_bucket = LANG_MAP[args.lang] + profile = LANGUAGE_PROFILES.get(language_bucket) + if not profile: + print(f"❌ Unsupported language bucket: {language_bucket}", file=sys.stderr) + return 1 + + # Resolve query / decks + if args.query: + query = args.query + else: + if args.deck: + decks = args.deck + else: + decks = [d for d, lang in DECK_TO_LANGUAGE.items() if lang == language_bucket] + if not decks: + print(f"❌ No decks mapped for language: {language_bucket}", file=sys.stderr) + return 1 + query = build_query_from_decks(decks) + + # Output paths + out_dir = os.path.expanduser(args.outdir) if args.outdir else os.path.join(DEFAULT_OUTPUT_ROOT, language_bucket) + default_outfile = os.path.join(out_dir, f"words_{args.lang}.txt") + out_path = os.path.expanduser(args.out) if args.out else default_outfile + + logging.info("lang=%s bucket=%s query=%s field=%s", args.lang, language_bucket, query, args.field) + print(f"🔎 Query: {query}") + print(f"🧾 Field: {args.field}") + + # Load spaCy model + model_name = args.spacy_model or profile["spacy_model"] + try: + nlp = load_spacy_model(model_name) + except Exception as e: + print(f"❌ {e}", file=sys.stderr) + 
logging.exception("spaCy load failed") + return 1 + + # Fetch notes + try: + notes = get_notes(query) + except Exception as e: + print(f"❌ Failed to query AnkiConnect: {e}", file=sys.stderr) + logging.exception("AnkiConnect query failed") + return 1 + + print(f"✅ Found {len(notes)} notes.") + if not notes: + print("⚠️ No notes found. Check your query/deck names.") + return 0 + + # Validate the field exists on at least one note + fields0 = (notes[0].get("fields", {}) or {}) + if args.field not in fields0: + available = list(fields0.keys()) + print(f"❌ Field '{args.field}' not found on sample note.", file=sys.stderr) + print(f" Available fields: {available}", file=sys.stderr) + return 1 + + # Extract + write + counter = extract_counts( + notes=notes, + field_name=args.field, + nlp=nlp, + token_filter=profile["token_filter"], + output_format=profile["output_format"], + use_full_field=args.full_field, + ) + + print(f"🧠 Extracted {len(counter)} unique entries (before min-freq filter).") + written = write_counts(counter, out_path, args.min_freq) + + print(f"📄 Wrote {written} entries to: {out_path}") + logging.info("wrote=%s out=%s", written, out_path) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) + diff --git a/yt-transcript.py b/yt-transcript.py new file mode 100755 index 0000000..1f04675 --- /dev/null +++ b/yt-transcript.py @@ -0,0 +1,199 @@ +#!/usr/bin/env python3 +""" +yt-transcript.py + +Extract vocab or timestamped lines from a YouTube transcript. + +Howto: + ./yt-transcript.py {jp,es} [options] + +Examples: + ./yt-transcript.py es https://youtu.be/SLgVwNulYhc --mode vocab --top 50 + ./yt-transcript.py jp SLgVwNulYhc --mode sentences + +Requirements: + pip install youtube-transcript-api + +Japanese tokenization (recommended "Option 1"): + pip install "fugashi[unidic-lite]" +""" + +from __future__ import annotations + +import re +import sys +import argparse +from collections import Counter +from urllib.parse import urlparse, parse_qs + +from youtube_transcript_api import YouTubeTranscriptApi + + +# ------------------------- +# Language mapping +# ------------------------- +LANG_MAP = { + "jp": "ja", + "es": "es", +} + +# Small starter stopword lists (you can grow these over time) +STOPWORDS = { + "es": { + "de", "la", "que", "el", "en", "y", "a", "los", "del", "se", "las", "por", + "un", "para", "con", "no", "una", "su", "al", "lo", "como", + }, + "en": {"the", "is", "and", "of", "to", "in", "it", "that", "on", "you", "this", "for", "with"}, + "ja": {"の", "に", "は", "を", "た", "が", "で", "て", "です", "ます", "する", "ある", "いる"}, +} + + +# ------------------------- +# URL / transcript helpers +# ------------------------- +def extract_video_id(url_or_id: str) -> str: + """Accept full YouTube URLs (including youtu.be) or raw video IDs.""" + if "youtube" in url_or_id or "youtu.be" in url_or_id: + query = urlparse(url_or_id) + + # youtu.be/ + if query.hostname == "youtu.be": + return query.path.lstrip("/") + + # youtube.com/watch?v= + if query.hostname in ("www.youtube.com", "youtube.com", "m.youtube.com"): + qs = parse_qs(query.query) + v = qs.get("v", []) + if v: + return v[0] + + return url_or_id + + +def fetch_transcript(video_id: str, lang_code: str): + """ + Support both youtube-transcript-api v1.x and older v0.x. 
+
+
+def fetch_transcript(video_id: str, lang_code: str):
+    """
+    Support both youtube-transcript-api v1.x and older v0.x.
+
+    - v1.x: instance method .fetch(video_id, languages=[...]) -> list of snippet objects
+    - v0.x: class method .get_transcript(video_id, languages=[...]) -> list of dicts
+    """
+    # Newer API (v1.x)
+    if hasattr(YouTubeTranscriptApi, "fetch"):
+        api = YouTubeTranscriptApi()
+        return api.fetch(video_id, languages=[lang_code])
+
+    # Older API (v0.x)
+    if hasattr(YouTubeTranscriptApi, "get_transcript"):
+        return YouTubeTranscriptApi.get_transcript(video_id, languages=[lang_code])
+
+    raise RuntimeError("Unsupported youtube-transcript-api version (missing fetch/get_transcript).")
+
+
+def snippet_text(entry) -> str:
+    """Entry can be a dict (old API) or a snippet object (new API)."""
+    if isinstance(entry, dict):
+        return (entry.get("text", "") or "")
+    return (getattr(entry, "text", "") or "")
+
+
+def snippet_start(entry) -> float:
+    """Entry can be a dict (old API) or a snippet object (new API)."""
+    if isinstance(entry, dict):
+        return float(entry.get("start", 0.0) or 0.0)
+    return float(getattr(entry, "start", 0.0) or 0.0)
+
+
+# -------------------------
+# Tokenization
+# -------------------------
+def tokenize_japanese(text: str) -> list[str]:
+    """
+    Japanese tokenization using fugashi (MeCab wrapper).
+    Recommended install: pip install "fugashi[unidic-lite]"
+    """
+    try:
+        from fugashi import Tagger
+    except ImportError as e:
+        raise RuntimeError('Japanese requires fugashi. Install: pip install "fugashi[unidic-lite]"') from e
+
+    tagger = Tagger()
+    return [w.surface for w in tagger(text)]
+
+
+def tokenize_spanish(text: str, raw: bool = False) -> list[str]:
+    """
+    Lightweight Spanish tokenization (keeps accented letters).
+    If raw=False, lowercases everything.
+    """
+    tokens = re.findall(r"\b[\wáéíóúñü]+\b", text)
+    return tokens if raw else [t.lower() for t in tokens]
+
+
+def count_words(tokens: list[str], lang_code: str, remove_stopwords: bool = True) -> Counter:
+    if remove_stopwords:
+        sw = STOPWORDS.get(lang_code, set())
+        tokens = [t for t in tokens if t not in sw]
+    return Counter(tokens)
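+
+
+# e.g. tokenize_spanish("¿Cómo estás hoy?") -> ["cómo", "estás", "hoy"]
+# (Python's \w is Unicode-aware, so accented letters survive tokenization)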
+
+
+# -------------------------
+# Main
+# -------------------------
+def main() -> int:
+    parser = argparse.ArgumentParser(
+        description="Extract vocab or timestamped lines from a YouTube transcript."
+    )
+    parser.add_argument("lang", choices=["jp", "es"], help="Language code (jp or es).")
+    parser.add_argument("video", help="YouTube video URL or ID")
+    parser.add_argument(
+        "--mode",
+        choices=["vocab", "sentences"],
+        default="vocab",
+        help="Mode: vocab (word counts) or sentences (timestamped lines)",
+    )
+    parser.add_argument("--top", type=int, default=None, help="Top N words (vocab mode only)")
+    parser.add_argument("--no-stopwords", action="store_true", help="Don't remove common words")
+    parser.add_argument(
+        "--raw",
+        action="store_true",
+        help="(Spanish only) Do not lowercase tokens",
+    )
+
+    args = parser.parse_args()
+    lang_code = LANG_MAP[args.lang]
+    video_id = extract_video_id(args.video)
+
+    try:
+        transcript = fetch_transcript(video_id, lang_code)
+    except Exception as e:
+        print(f"Error fetching transcript: {e}", file=sys.stderr)
+        return 1
+
+    if args.mode == "sentences":
+        for entry in transcript:
+            start = snippet_start(entry)
+            text = snippet_text(entry).replace("\n", " ").strip()
+            if text:
+                print(f"[{start:.2f}s] {text}")
+        return 0
+
+    # vocab mode
+    text = " ".join(snippet_text(entry) for entry in transcript).replace("\n", " ")
+
+    if lang_code == "ja":
+        tokens = tokenize_japanese(text)
+    else:
+        tokens = tokenize_spanish(text, raw=args.raw)
+
+    counts = count_words(tokens, lang_code, remove_stopwords=not args.no_stopwords)
+    items = counts.most_common(args.top) if args.top else counts.most_common()
+
+    for word, count in items:
+        print(f"{word}: {count}")
+
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())