Add shared config and update tool scripts
This commit is contained in:
4
.gitignore
vendored
Normal file
4
.gitignore
vendored
Normal file
@@ -0,0 +1,4 @@
|
|||||||
|
__pycache__/
|
||||||
|
*.py[cod]
|
||||||
|
.venv/
|
||||||
|
venv/
|
||||||
31
README.md
31
README.md
@@ -29,10 +29,10 @@ Personally, I like to have one venv that contains all the prerequisites.
|
|||||||
python3.12 -m venv ~/.venv/anki-tools
|
python3.12 -m venv ~/.venv/anki-tools
|
||||||
source ~/.venv/anki-tools/bin/activate
|
source ~/.venv/anki-tools/bin/activate
|
||||||
python3 -m pip install -U pip
|
python3 -m pip install -U pip
|
||||||
pip install gtts jq yq spacy youtube-transcript-api pyyaml genanki fugashi regex requests
|
pip install -r requirements.txt
|
||||||
|
|
||||||
# Also install ffmpeg
|
# Also install system command-line dependencies
|
||||||
sudo dnf install ffmpeg
|
sudo dnf install ffmpeg jq
|
||||||
```
|
```
|
||||||
That way, whenever you want to run these scripts, you can just source the venv and run the appropriate script.
|
That way, whenever you want to run these scripts, you can just source the venv and run the appropriate script.
|
||||||
|
|
||||||
@@ -55,6 +55,17 @@ Most scripts assume:
|
|||||||
- that your anki cards are basic, with audio on the front and the sentence (in the target language) on the back. These tools only look at the first line of the back, so you can have notes/translations/etc. on the following lines if you like.
|
- that your anki cards are basic, with audio on the front and the sentence (in the target language) on the back. These tools only look at the first line of the back, so you can have notes/translations/etc. on the following lines if you like.
|
||||||

|

|
||||||
|
|
||||||
|
### Shared configuration
|
||||||
|
|
||||||
|
Common settings live in `anki_common.py`, including:
|
||||||
|
- the AnkiConnect URL
|
||||||
|
- language code mappings (`jp`, `es`)
|
||||||
|
- deck-to-language mappings
|
||||||
|
- default output directories
|
||||||
|
- the default Anki `collection.media` path used by `audio_extractor.py`
|
||||||
|
|
||||||
|
If you rename your decks, add another language, or use a different default media location, update `anki_common.py` once instead of editing each script separately. Some settings can also be overridden at runtime, such as `audio_extractor.py --media-dir`.
|
||||||
|
|
||||||
### Language support
|
### Language support
|
||||||
- 🇯🇵 日本語
|
- 🇯🇵 日本語
|
||||||
- 🇪🇸 Español
|
- 🇪🇸 Español
|
||||||
@@ -66,15 +77,18 @@ Most scripts assume:
|
|||||||
### Usage:
|
### Usage:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
./audio_extractor.py jp [--concat] [--outdir DIR] [--copy-only-new]
|
./audio_extractor.py jp [--concat] [--outdir DIR] [--media-dir DIR] [--copy-only-new]
|
||||||
./audio_extractor.py es [--concat] [--outdir DIR] [--copy-only-new]
|
./audio_extractor.py es [--concat] [--outdir DIR] [--media-dir DIR] [--copy-only-new]
|
||||||
```
|
```
|
||||||
|
|
||||||
Outputs:
|
Outputs:
|
||||||
- Copies audio into `~/Languages/Anki/anki-audio/<language>/` by default
|
- Copies audio into `~/Languages/Anki/anki-audio/<language>/` by default
|
||||||
- Writes `<language>.m3u`
|
- Writes `<language>.m3u`, which also lists audio files copied into subfolders
|
||||||
- With `--concat`, writes `<language>_concat.mp3` (keeps individual files)
|
- With `--concat`, writes `<language>_concat.mp3` (keeps individual files)
|
||||||
|
|
||||||
|
Options:
|
||||||
|
- `--media-dir DIR`: override the Anki `collection.media` directory. By default, this uses the common Flatpak path: `~/.var/app/net.ankiweb.Anki/data/Anki2/User 1/collection.media`
|
||||||
|
|
||||||
### Requirements
|
### Requirements
|
||||||
- Anki + AnkiConnect
|
- Anki + AnkiConnect
|
||||||
- `requests`
|
- `requests`
|
||||||
@@ -103,7 +117,7 @@ Outputs:
|
|||||||
|
|
||||||
### Requirements
|
### Requirements
|
||||||
- Anki + AnkiConnect
|
- Anki + AnkiConnect
|
||||||
- `gtts-cli`, `ffmpeg`, `curl`
|
- `gtts-cli`, `ffmpeg`, `curl`, `jq`
|
||||||
|
|
||||||
### Sentence files
|
### Sentence files
|
||||||
- Japanese: `~/Languages/Anki/sentences_jp.txt`
|
- Japanese: `~/Languages/Anki/sentences_jp.txt`
|
||||||
@@ -111,6 +125,7 @@ Outputs:
|
|||||||
|
|
||||||
### Notes
|
### Notes
|
||||||
- Audio files are generated in a temporary directory and cleaned up after import. No local audio files are retained.
|
- Audio files are generated in a temporary directory and cleaned up after import. No local audio files are retained.
|
||||||
|
- Sentences and tags are encoded as JSON with `jq`, so quotes and punctuation in sentence files are handled safely.
|
||||||
|
|
||||||
## word-scraper
|
## word-scraper
|
||||||
|
|
||||||
@@ -304,4 +319,4 @@ Example:
|
|||||||
# License
|
# License
|
||||||
|
|
||||||
This project is licensed under the MIT License.
|
This project is licensed under the MIT License.
|
||||||
See the [`LICENSE`](./LICENSE) file for details.
|
See the [`LICENSE`](./LICENSE) file for details.
|
||||||
|
|||||||
47
anki_common.py
Normal file
47
anki_common.py
Normal file
@@ -0,0 +1,47 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Shared configuration and AnkiConnect helpers for the toolkit scripts."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import os
|
||||||
|
from typing import Dict
|
||||||
|
|
||||||
|
import requests
|
||||||
|
|
||||||
|
ANKI_CONNECT_URL = "http://localhost:8765"
|
||||||
|
|
||||||
|
LANG_MAP: Dict[str, str] = {
|
||||||
|
"jp": "japanese",
|
||||||
|
"es": "spanish",
|
||||||
|
}
|
||||||
|
|
||||||
|
TRANSCRIPT_LANG_MAP: Dict[str, str] = {
|
||||||
|
"jp": "ja",
|
||||||
|
"es": "es",
|
||||||
|
}
|
||||||
|
|
||||||
|
DECK_TO_LANGUAGE: Dict[str, str] = {
|
||||||
|
"日本語": "japanese",
|
||||||
|
"Español": "spanish",
|
||||||
|
}
|
||||||
|
|
||||||
|
DEFAULT_ANKI_MEDIA_DIR = os.path.expanduser(
|
||||||
|
"~/.var/app/net.ankiweb.Anki/data/Anki2/User 1/collection.media"
|
||||||
|
)
|
||||||
|
|
||||||
|
DEFAULT_AUDIO_OUTPUT_ROOT = os.path.expanduser("~/Languages/Anki/anki-audio")
|
||||||
|
DEFAULT_WORD_OUTPUT_ROOT = os.path.expanduser("~/Languages/Anki/anki-words")
|
||||||
|
|
||||||
|
|
||||||
|
def anki_request(action: str, **params):
    """Send one action to AnkiConnect and hand back its ``result`` value.

    Args:
        action: AnkiConnect action name placed in the request body.
        **params: Keyword arguments forwarded as the action's ``params`` object.

    Returns:
        The ``result`` entry of the decoded JSON response.

    Raises:
        requests.HTTPError: If the HTTP response status signals a failure.
        RuntimeError: If the response carries a non-null ``error`` entry.
    """
    payload = {"action": action, "version": 6, "params": params}
    response = requests.post(ANKI_CONNECT_URL, json=payload, timeout=30)
    response.raise_for_status()

    body = response.json()
    error = body.get("error")
    if error is None:
        return body["result"]
    raise RuntimeError(f"AnkiConnect error for {action}: {error}")
|
||||||
@@ -28,66 +28,54 @@ import argparse
|
|||||||
import shutil
|
import shutil
|
||||||
import subprocess
|
import subprocess
|
||||||
import tempfile
|
import tempfile
|
||||||
from typing import Dict, List
|
from typing import List
|
||||||
|
|
||||||
import requests
|
from anki_common import (
|
||||||
|
DEFAULT_ANKI_MEDIA_DIR,
|
||||||
|
DEFAULT_AUDIO_OUTPUT_ROOT,
|
||||||
# Map deck name -> language bucket
|
DECK_TO_LANGUAGE,
|
||||||
deck_to_language: Dict[str, str] = {
|
LANG_MAP,
|
||||||
"日本語": "japanese",
|
anki_request,
|
||||||
"Español": "spanish",
|
)
|
||||||
# Add more mappings here
|
|
||||||
}
|
|
||||||
|
|
||||||
# Map CLI lang code -> language bucket
|
|
||||||
lang_map: Dict[str, str] = {
|
|
||||||
"jp": "japanese",
|
|
||||||
"es": "spanish",
|
|
||||||
}
|
|
||||||
|
|
||||||
# If Anki is installed as a flatpak, media dir is typically:
|
|
||||||
media_dir = os.path.expanduser("~/.var/app/net.ankiweb.Anki/data/Anki2/User 1/collection.media")
|
|
||||||
|
|
||||||
# Default export root (can be overridden by --outdir)
|
|
||||||
output_root = os.path.expanduser("~/Languages/Anki/anki-audio")
|
|
||||||
|
|
||||||
AUDIO_EXTS = (".mp3", ".wav", ".ogg", ".m4a", ".flac")
|
AUDIO_EXTS = (".mp3", ".wav", ".ogg", ".m4a", ".flac")
|
||||||
|
|
||||||
|
|
||||||
def anki_request(action: str, **params):
|
|
||||||
"""Make an AnkiConnect request and return 'result'. Raise on error."""
|
|
||||||
resp = requests.post(
|
|
||||||
"http://localhost:8765",
|
|
||||||
json={"action": action, "version": 6, "params": params},
|
|
||||||
timeout=30,
|
|
||||||
)
|
|
||||||
resp.raise_for_status()
|
|
||||||
data = resp.json()
|
|
||||||
if data.get("error") is not None:
|
|
||||||
raise RuntimeError(f"AnkiConnect error for {action}: {data['error']}")
|
|
||||||
return data["result"]
|
|
||||||
|
|
||||||
|
|
||||||
def ensure_ffmpeg_available() -> None:
    """Verify that an ``ffmpeg`` executable can be found on PATH.

    Raises:
        RuntimeError: If ``shutil.which`` cannot locate ``ffmpeg``.
    """
    if shutil.which("ffmpeg") is not None:
        return
    raise RuntimeError("ffmpeg not found in PATH. Install ffmpeg to use --concat.")
|
||||||
|
|
||||||
|
|
||||||
|
def resolve_media_paths(media_dir: str, out_dir: str, media_name: str) -> tuple[str, str] | None:
|
||||||
|
"""Return safe source/destination paths for an Anki media filename."""
|
||||||
|
normalized = os.path.normpath(media_name)
|
||||||
|
if os.path.isabs(normalized) or normalized.startswith(".."):
|
||||||
|
return None
|
||||||
|
return os.path.join(media_dir, normalized), os.path.join(out_dir, normalized)
|
||||||
|
|
||||||
|
|
||||||
def build_playlist(out_dir: str, language: str) -> str:
    """
    Write ``<language>.m3u`` in out_dir, one relative path per line.

    Audio files anywhere under out_dir (matched against AUDIO_EXTS,
    case-insensitively) are listed, except the top-level playlist file and the
    top-level ``<language>_concat.mp3``. Entries are sorted by their path
    relative to out_dir.
    Returns the playlist path.
    """
    m3u_path = os.path.join(out_dir, f"{language}.m3u")
    # Relative paths that must never appear in the playlist itself.
    skipped = (os.path.basename(m3u_path), f"{language}_concat.mp3")

    entries: List[str] = []
    for dirpath, _dirnames, names in os.walk(out_dir):
        for name in names:
            full_path = os.path.join(dirpath, name)
            relative = os.path.relpath(full_path, out_dir)
            if relative in skipped:
                continue
            if not name.lower().endswith(AUDIO_EXTS):
                continue
            if os.path.isfile(full_path):
                entries.append(relative)
    entries.sort()

    with open(m3u_path, "w", encoding="utf-8") as fh:
        fh.writelines(f"{entry}\n" for entry in entries)

    return m3u_path
|
||||||
@@ -156,7 +144,7 @@ def main() -> int:
|
|||||||
# REQUIRED positional language code: jp / es
|
# REQUIRED positional language code: jp / es
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"lang",
|
"lang",
|
||||||
choices=sorted(lang_map.keys()),
|
choices=sorted(LANG_MAP.keys()),
|
||||||
help="Language code (jp or es).",
|
help="Language code (jp or es).",
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -170,6 +158,11 @@ def main() -> int:
|
|||||||
"--outdir",
|
"--outdir",
|
||||||
help="Output directory. Default: ~/Languages/Anki/anki-audio/<language>",
|
help="Output directory. Default: ~/Languages/Anki/anki-audio/<language>",
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--media-dir",
|
||||||
|
default=DEFAULT_ANKI_MEDIA_DIR,
|
||||||
|
help="Anki collection.media directory. Defaults to the common Flatpak profile path.",
|
||||||
|
)
|
||||||
|
|
||||||
# Keep your existing useful behavior
|
# Keep your existing useful behavior
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
@@ -180,16 +173,17 @@ def main() -> int:
|
|||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
language = lang_map[args.lang]
|
language = LANG_MAP[args.lang]
|
||||||
|
media_dir = os.path.expanduser(args.media_dir)
|
||||||
|
|
||||||
# Find all decks whose mapped language matches
|
# Find all decks whose mapped language matches
|
||||||
selected_decks = [deck for deck, lang in deck_to_language.items() if lang == language]
|
selected_decks = [deck for deck, lang in DECK_TO_LANGUAGE.items() if lang == language]
|
||||||
if not selected_decks:
|
if not selected_decks:
|
||||||
print(f"No decks found for language: {language}", file=sys.stderr)
|
print(f"No decks found for language: {language}", file=sys.stderr)
|
||||||
return 1
|
return 1
|
||||||
|
|
||||||
# Output folder: either user-specified --outdir or default output_root/<language>
|
# Output folder: either user-specified --outdir or default output root/<language>
|
||||||
out_dir = os.path.expanduser(args.outdir) if args.outdir else os.path.join(output_root, language)
|
out_dir = os.path.expanduser(args.outdir) if args.outdir else os.path.join(DEFAULT_AUDIO_OUTPUT_ROOT, language)
|
||||||
os.makedirs(out_dir, exist_ok=True)
|
os.makedirs(out_dir, exist_ok=True)
|
||||||
|
|
||||||
# Collect note IDs across selected decks
|
# Collect note IDs across selected decks
|
||||||
@@ -212,8 +206,11 @@ def main() -> int:
|
|||||||
for field in fields.values():
|
for field in fields.values():
|
||||||
val = field.get("value", "") or ""
|
val = field.get("value", "") or ""
|
||||||
for match in re.findall(r"\[sound:(.+?)\]", val):
|
for match in re.findall(r"\[sound:(.+?)\]", val):
|
||||||
src = os.path.join(media_dir, match)
|
paths = resolve_media_paths(media_dir, out_dir, match)
|
||||||
dst = os.path.join(out_dir, match)
|
if paths is None:
|
||||||
|
print(f"Skipping unsafe media reference: {match}", file=sys.stderr)
|
||||||
|
continue
|
||||||
|
src, dst = paths
|
||||||
|
|
||||||
if not os.path.exists(src):
|
if not os.path.exists(src):
|
||||||
continue
|
continue
|
||||||
@@ -229,7 +226,7 @@ def main() -> int:
|
|||||||
shutil.copy2(src, dst)
|
shutil.copy2(src, dst)
|
||||||
copied.append(match)
|
copied.append(match)
|
||||||
|
|
||||||
# Create playlist (top-level audio only; if you have subfolders, you can extend this)
|
# Create playlist, including audio in subfolders.
|
||||||
m3u_path = build_playlist(out_dir, language)
|
m3u_path = build_playlist(out_dir, language)
|
||||||
|
|
||||||
print(f"\n✅ Copied {len(copied)} files for {language}")
|
print(f"\n✅ Copied {len(copied)} files for {language}")
|
||||||
@@ -251,4 +248,3 @@ def main() -> int:
|
|||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
raise SystemExit(main())
|
raise SystemExit(main())
|
||||||
|
|
||||||
|
|||||||
@@ -60,19 +60,32 @@ done
|
|||||||
|
|
||||||
[[ -z "$lang" ]] && arg_error_missing_lang
|
[[ -z "$lang" ]] && arg_error_missing_lang
|
||||||
|
|
||||||
|
# Abort the script unless the named command can be found.
# $1: name of the command to look up with `command -v`.
# Prints an error prefixed with $prog to stderr and exits with status 1
# when the command is missing.
require_command() {
    local cmd="$1"
    command -v "$cmd" >/dev/null 2>&1 && return 0
    echo "$prog: error: required command not found: $cmd" >&2
    exit 1
}
|
||||||
|
|
||||||
|
require_command gtts-cli
|
||||||
|
require_command ffmpeg
|
||||||
|
require_command curl
|
||||||
|
require_command jq
|
||||||
|
|
||||||
# Build tags JSON array - text-to-speech is always included
|
# Build tags JSON array - text-to-speech is always included
|
||||||
TAGS='["text-to-speech"'
|
tags=("text-to-speech")
|
||||||
if [[ -n "$custom_tags" ]]; then
|
if [[ -n "$custom_tags" ]]; then
|
||||||
IFS=',' read -ra tag_array <<< "$custom_tags"
|
IFS=',' read -ra tag_array <<< "$custom_tags"
|
||||||
for tag in "${tag_array[@]}"; do
|
for tag in "${tag_array[@]}"; do
|
||||||
# Trim whitespace
|
# Trim whitespace
|
||||||
tag="$(echo -e "$tag" | sed 's/^[[:space:]]*//;s/[[:space:]]*$//')"
|
tag="$(printf '%s' "$tag" | sed 's/^[[:space:]]*//;s/[[:space:]]*$//')"
|
||||||
TAGS+=", \"$tag\""
|
[[ -n "$tag" ]] && tags+=("$tag")
|
||||||
done
|
done
|
||||||
else
|
else
|
||||||
TAGS+=', "AI-generated"'
|
tags+=("AI-generated")
|
||||||
fi
|
fi
|
||||||
TAGS+=']'
|
|
||||||
|
TAGS="$(printf '%s\n' "${tags[@]}" | jq -R . | jq -s .)"
|
||||||
|
|
||||||
case "$lang" in
|
case "$lang" in
|
||||||
jp)
|
jp)
|
||||||
@@ -93,8 +106,14 @@ esac
|
|||||||
|
|
||||||
count=0
|
count=0
|
||||||
|
|
||||||
|
if [[ ! -f "$SENTENCE_FILE" ]]; then
|
||||||
|
echo "$prog: error: sentence file not found: $SENTENCE_FILE" >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
# Use a temporary directory to handle processing
|
# Use a temporary directory to handle processing
|
||||||
TEMP_DIR=$(mktemp -d)
|
TEMP_DIR=$(mktemp -d)
|
||||||
|
trap 'rm -rf "$TEMP_DIR"' EXIT
|
||||||
|
|
||||||
while IFS= read -r sentence || [[ -n "$sentence" ]]; do
|
while IFS= read -r sentence || [[ -n "$sentence" ]]; do
|
||||||
[[ -z "$sentence" ]] && continue
|
[[ -z "$sentence" ]] && continue
|
||||||
@@ -115,31 +134,39 @@ while IFS= read -r sentence || [[ -n "$sentence" ]]; do
|
|||||||
if ffmpeg -loglevel error -i "$RAW_OUTPUT" -filter:a "atempo=$TEMPO" -y "$OUTPUT_PATH" < /dev/null; then
|
if ffmpeg -loglevel error -i "$RAW_OUTPUT" -filter:a "atempo=$TEMPO" -y "$OUTPUT_PATH" < /dev/null; then
|
||||||
|
|
||||||
# 3. Add to Anki using the sped-up file
|
# 3. Add to Anki using the sped-up file
|
||||||
result=$(curl -s localhost:8765 -X POST -d "{
|
payload="$(jq -n \
|
||||||
\"action\": \"addNote\",
|
--arg deck "$DECK_NAME" \
|
||||||
\"version\": 6,
|
--arg sentence "$sentence" \
|
||||||
\"params\": {
|
--arg path "$OUTPUT_PATH" \
|
||||||
\"note\": {
|
--arg filename "${BASENAME}.mp3" \
|
||||||
\"deckName\": \"$DECK_NAME\",
|
--argjson tags "$TAGS" \
|
||||||
\"modelName\": \"Basic\",
|
'{
|
||||||
\"fields\": {
|
action: "addNote",
|
||||||
\"Front\": \"\",
|
version: 6,
|
||||||
\"Back\": \"$sentence\"
|
params: {
|
||||||
},
|
note: {
|
||||||
\"options\": {
|
deckName: $deck,
|
||||||
\"allowDuplicate\": false
|
modelName: "Basic",
|
||||||
},
|
fields: {
|
||||||
\"tags\": $TAGS,
|
Front: "",
|
||||||
\"audio\": [{
|
Back: $sentence
|
||||||
\"path\": \"$OUTPUT_PATH\",
|
},
|
||||||
\"filename\": \"${BASENAME}.mp3\",
|
options: {
|
||||||
\"fields\": [\"Front\"]
|
allowDuplicate: false
|
||||||
}]
|
},
|
||||||
}
|
tags: $tags,
|
||||||
}
|
audio: [{
|
||||||
}")
|
path: $path,
|
||||||
|
filename: $filename,
|
||||||
|
fields: ["Front"]
|
||||||
|
}]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}')"
|
||||||
|
|
||||||
if [[ "$result" == *'"error": null'* ]]; then
|
result=$(curl -s localhost:8765 -X POST -H "Content-Type: application/json" -d "$payload")
|
||||||
|
|
||||||
|
if jq -e '.error == null' >/dev/null 2>&1 <<< "$result"; then
|
||||||
echo "✅ Added card: $sentence"
|
echo "✅ Added card: $sentence"
|
||||||
((count++))
|
((count++))
|
||||||
else
|
else
|
||||||
@@ -158,7 +185,4 @@ while IFS= read -r sentence || [[ -n "$sentence" ]]; do
|
|||||||
|
|
||||||
done <"$SENTENCE_FILE"
|
done <"$SENTENCE_FILE"
|
||||||
|
|
||||||
# Cleanup temp directory
|
echo "🎉 Done! Added $count cards to deck \"$DECK_NAME\"."
|
||||||
rm -rf "$TEMP_DIR"
|
|
||||||
|
|
||||||
echo "🎉 Done! Added $count cards to deck \"$DECK_NAME\"."
|
|
||||||
|
|||||||
8
requirements.txt
Normal file
8
requirements.txt
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
requests
|
||||||
|
regex
|
||||||
|
spacy
|
||||||
|
youtube-transcript-api
|
||||||
|
fugashi[unidic-lite]
|
||||||
|
gTTS
|
||||||
|
pyyaml
|
||||||
|
genanki
|
||||||
@@ -1,15 +1,15 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
"""
|
"""
|
||||||
word_extractor.py
|
word_scraper.py
|
||||||
|
|
||||||
Extract frequent words/lemmas from Anki notes via AnkiConnect.
|
Extract frequent words/lemmas from Anki notes via AnkiConnect.
|
||||||
|
|
||||||
Howto:
|
Howto:
|
||||||
./word_extractor.py jp [--deck "日本語"] [--field Back] [--min-freq 2] [--outdir DIR] [--out FILE]
|
./word_scraper.py jp [--deck "日本語"] [--field Back] [--min-freq 2] [--outdir DIR] [--out FILE]
|
||||||
./word_extractor.py es [--deck "Español"] [--field Back] [--min-freq 2] [--outdir DIR] [--out FILE]
|
./word_scraper.py es [--deck "Español"] [--field Back] [--min-freq 2] [--outdir DIR] [--out FILE]
|
||||||
|
|
||||||
By default, this:
|
By default, this:
|
||||||
- chooses decks based on the lang code (jp/es) using deck_to_language mappings
|
- chooses decks based on the lang code (jp/es) using shared deck mappings
|
||||||
- pulls notes from Anki via AnkiConnect (http://localhost:8765)
|
- pulls notes from Anki via AnkiConnect (http://localhost:8765)
|
||||||
- reads a single field (default: Back)
|
- reads a single field (default: Back)
|
||||||
- extracts the first visible line (HTML stripped) from that field
|
- extracts the first visible line (HTML stripped) from that field
|
||||||
@@ -29,30 +29,11 @@ import os
|
|||||||
import sys
|
import sys
|
||||||
from collections import Counter
|
from collections import Counter
|
||||||
from html import unescape
|
from html import unescape
|
||||||
from typing import Callable, Dict, Iterable, List, Optional, Tuple
|
from typing import Callable, List
|
||||||
|
|
||||||
import requests
|
|
||||||
import regex as re
|
import regex as re
|
||||||
|
|
||||||
|
from anki_common import DEFAULT_WORD_OUTPUT_ROOT, DECK_TO_LANGUAGE, LANG_MAP, anki_request
|
||||||
# -------------------------
|
|
||||||
# Shared “language plumbing”
|
|
||||||
# -------------------------
|
|
||||||
# Match the idea used in audio_extractor.py: CLI lang code -> language bucket. :contentReference[oaicite:2]{index=2}
|
|
||||||
LANG_MAP: Dict[str, str] = {
|
|
||||||
"jp": "japanese",
|
|
||||||
"es": "spanish",
|
|
||||||
}
|
|
||||||
|
|
||||||
# Map deck name -> language bucket (same pattern as audio_extractor.py). :contentReference[oaicite:3]{index=3}
|
|
||||||
DECK_TO_LANGUAGE: Dict[str, str] = {
|
|
||||||
"日本語": "japanese",
|
|
||||||
"Español": "spanish",
|
|
||||||
# Add more deck mappings here
|
|
||||||
}
|
|
||||||
|
|
||||||
# Default output root (mirrors the “one folder per language” idea)
|
|
||||||
DEFAULT_OUTPUT_ROOT = os.path.expanduser("~/Languages/Anki/anki-words")
|
|
||||||
|
|
||||||
|
|
||||||
# -------------------------
|
# -------------------------
|
||||||
@@ -90,26 +71,6 @@ def extract_visible_text(text: str) -> str:
|
|||||||
return text.strip()
|
return text.strip()
|
||||||
|
|
||||||
|
|
||||||
# -------------------------
|
|
||||||
# AnkiConnect helper
|
|
||||||
# -------------------------
|
|
||||||
def anki_request(action: str, **params):
|
|
||||||
"""
|
|
||||||
Make an AnkiConnect request and return 'result'.
|
|
||||||
Raises a helpful error if the HTTP call fails or AnkiConnect returns an error.
|
|
||||||
"""
|
|
||||||
resp = requests.post(
|
|
||||||
"http://localhost:8765",
|
|
||||||
json={"action": action, "version": 6, "params": params},
|
|
||||||
timeout=30,
|
|
||||||
)
|
|
||||||
resp.raise_for_status()
|
|
||||||
data = resp.json()
|
|
||||||
if data.get("error") is not None:
|
|
||||||
raise RuntimeError(f"AnkiConnect error for {action}: {data['error']}")
|
|
||||||
return data["result"]
|
|
||||||
|
|
||||||
|
|
||||||
def get_notes(query: str) -> List[dict]:
|
def get_notes(query: str) -> List[dict]:
|
||||||
"""
|
"""
|
||||||
Query Anki for notes and return notesInfo payload.
|
Query Anki for notes and return notesInfo payload.
|
||||||
@@ -333,7 +294,7 @@ def main() -> int:
|
|||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--logfile",
|
"--logfile",
|
||||||
default=os.path.expanduser("~/Languages/Anki/anki-words/extract_words.log"),
|
default=os.path.join(DEFAULT_WORD_OUTPUT_ROOT, "extract_words.log"),
|
||||||
help="Log file path.",
|
help="Log file path.",
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -361,7 +322,7 @@ def main() -> int:
|
|||||||
query = build_query_from_decks(decks)
|
query = build_query_from_decks(decks)
|
||||||
|
|
||||||
# Output paths
|
# Output paths
|
||||||
out_dir = os.path.expanduser(args.outdir) if args.outdir else os.path.join(DEFAULT_OUTPUT_ROOT, language_bucket)
|
out_dir = os.path.expanduser(args.outdir) if args.outdir else os.path.join(DEFAULT_WORD_OUTPUT_ROOT, language_bucket)
|
||||||
default_outfile = os.path.join(out_dir, f"words_{args.lang}.txt")
|
default_outfile = os.path.join(out_dir, f"words_{args.lang}.txt")
|
||||||
out_path = os.path.expanduser(args.out) if args.out else default_outfile
|
out_path = os.path.expanduser(args.out) if args.out else default_outfile
|
||||||
|
|
||||||
@@ -419,4 +380,3 @@ def main() -> int:
|
|||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
raise SystemExit(main())
|
raise SystemExit(main())
|
||||||
|
|
||||||
|
|||||||
@@ -28,14 +28,7 @@ from urllib.parse import urlparse, parse_qs
|
|||||||
|
|
||||||
from youtube_transcript_api import YouTubeTranscriptApi
|
from youtube_transcript_api import YouTubeTranscriptApi
|
||||||
|
|
||||||
|
from anki_common import TRANSCRIPT_LANG_MAP
|
||||||
# -------------------------
|
|
||||||
# Language mapping
|
|
||||||
# -------------------------
|
|
||||||
LANG_MAP = {
|
|
||||||
"jp": "ja",
|
|
||||||
"es": "es",
|
|
||||||
}
|
|
||||||
|
|
||||||
# Small starter stopword lists (you can grow these over time)
|
# Small starter stopword lists (you can grow these over time)
|
||||||
STOPWORDS = {
|
STOPWORDS = {
|
||||||
@@ -160,7 +153,7 @@ def main() -> int:
|
|||||||
)
|
)
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
lang_code = LANG_MAP[args.lang]
|
lang_code = TRANSCRIPT_LANG_MAP[args.lang]
|
||||||
video_id = extract_video_id(args.video)
|
video_id = extract_video_id(args.video)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@@ -196,4 +189,3 @@ def main() -> int:
|
|||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
raise SystemExit(main())
|
raise SystemExit(main())
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user