#!/usr/bin/env python3
"""
audio_extractor.py

Extract all Anki media referenced by [sound:...] tags from one or more decks
(grouped by language), copy them into a language-specific output folder,
write an .m3u playlist, and optionally concatenate all audio into a single
MP3 file.

Howto:
  ./audio_extractor.py jp [--concat] [--outdir DIR] [--copy-only-new]
  ./audio_extractor.py es [--concat] [--outdir DIR] [--copy-only-new]

Requirements:
  - Anki running + AnkiConnect enabled at http://localhost:8765
  - Python package: requests
  - OPTIONAL (for --concat): ffmpeg

Notes:
  - This scans all fields of each note and extracts filenames inside
    [sound:...]
  - It copies referenced media files out of Anki's collection.media folder
  - It preserves filenames (and subfolders if they exist)
"""

import os
import re
import sys
import argparse
import shutil
import subprocess
import tempfile
from typing import List

from anki_common import (
    DEFAULT_ANKI_MEDIA_DIR,
    DEFAULT_AUDIO_OUTPUT_ROOT,
    DECK_TO_LANGUAGE,
    LANG_MAP,
    anki_request,
)

# File extensions (lower-case) treated as audio by the playlist and concat steps.
AUDIO_EXTS = (".mp3", ".wav", ".ogg", ".m4a", ".flac")

# Captures the filename inside an Anki "[sound:<filename>]" tag.
_SOUND_TAG_RE = re.compile(r"\[sound:(.+?)\]")


def ensure_ffmpeg_available() -> None:
    """Raise a helpful error if ffmpeg isn't installed.

    Raises:
        RuntimeError: if no ``ffmpeg`` executable is found on PATH.
    """
    if shutil.which("ffmpeg") is None:
        raise RuntimeError("ffmpeg not found in PATH. Install ffmpeg to use --concat.")


def resolve_media_paths(
    media_dir: str, out_dir: str, media_name: str
) -> tuple[str, str] | None:
    """Return safe source/destination paths for an Anki media filename.

    Args:
        media_dir: Directory the media file is read from.
        out_dir: Directory the media file will be copied into.
        media_name: Filename (optionally with subfolders) taken from a
            [sound:...] tag.

    Returns:
        ``(source_path, destination_path)``, or ``None`` when the name is
        absolute, drive-qualified, escapes the base directory through a
        parent-directory component, or does not name a file at all.
    """
    normalized = os.path.normpath(media_name)

    # Absolute or drive-qualified names (e.g. "C:foo" on Windows) would make
    # os.path.join discard the base directory entirely.
    if os.path.isabs(normalized) or os.path.splitdrive(normalized)[0]:
        return None

    # normpath("") and normpath("./") both give ".", which names the base
    # directory itself rather than a file inside it.
    if normalized == os.curdir:
        return None

    # After normpath, any escape attempt surfaces as a leading ".." component.
    # Compare whole components so that names like "..intro.mp3" stay valid.
    if normalized == os.pardir or normalized.startswith(os.pardir + os.sep):
        return None

    return os.path.join(media_dir, normalized), os.path.join(out_dir, normalized)


def build_playlist(out_dir: str, language: str) -> str:
    """
    Create an .m3u playlist listing audio files under out_dir (sorted by
    filename).

    Entries are written relative to out_dir, one per line. The playlist file
    itself and the "<language>_concat.mp3" output at the top of out_dir are
    excluded.

    Returns the playlist path.
    """
    m3u_path = os.path.join(out_dir, f"{language}.m3u")
    m3u_name = os.path.basename(m3u_path)
    concat_name = f"{language}_concat.mp3"

    files: List[str] = []
    for root, _, filenames in os.walk(out_dir):
        for fname in filenames:
            abs_path = os.path.join(root, fname)
            rel_path = os.path.relpath(abs_path, out_dir)
            # Never list our own generated artifacts.
            if rel_path in (m3u_name, concat_name):
                continue
            if fname.lower().endswith(AUDIO_EXTS) and os.path.isfile(abs_path):
                files.append(rel_path)

    with open(m3u_path, "w", encoding="utf-8") as fh:
        fh.writelines(f"{rel_path}\n" for rel_path in sorted(files))

    return m3u_path


def concat_audio_from_m3u(out_dir: str, m3u_path: str, out_path: str) -> None:
    """
    Concatenate audio files in the order listed in the .m3u.

    Uses ffmpeg concat demuxer and re-encodes to MP3 for reliability.
    Keeps original files untouched.

    Args:
        out_dir: Directory that playlist entries are relative to.
        m3u_path: Playlist to read (one relative filename per line).
        out_path: Path of the MP3 file to write (overwritten if present).

    Raises:
        RuntimeError: if ffmpeg is missing or the playlist yields no
            existing audio files.
        subprocess.CalledProcessError: if ffmpeg exits with an error.
    """
    ensure_ffmpeg_available()

    # Read playlist entries (filenames, one per line)
    with open(m3u_path, "r", encoding="utf-8") as fh:
        rel_files = [line.strip() for line in fh if line.strip()]

    # Filter to existing audio files. The output file is never used as an
    # input, otherwise ffmpeg would read and overwrite the same path.
    target = os.path.abspath(out_path)
    abs_files: List[str] = []
    for rel in rel_files:
        candidate = os.path.abspath(os.path.join(out_dir, rel))
        if candidate == target:
            continue
        if os.path.isfile(candidate) and rel.lower().endswith(AUDIO_EXTS):
            abs_files.append(candidate)

    if not abs_files:
        raise RuntimeError("No audio files found to concatenate (playlist is empty?).")

    # ffmpeg concat demuxer expects a file with lines like:
    #   file '/abs/path/to/file'
    # Use a temp file, and remove it whether writing or ffmpeg fails.
    concat_list_path = None
    try:
        with tempfile.NamedTemporaryFile("w", delete=False, encoding="utf-8") as tmp:
            concat_list_path = tmp.name
            for path in abs_files:
                # Escape single quotes for ffmpeg concat list
                safe = path.replace("'", "'\\''")
                tmp.write(f"file '{safe}'\n")

        # Re-encode to MP3 to avoid header/codec mismatches across files
        cmd = [
            "ffmpeg",
            "-hide_banner",
            "-loglevel", "error",
            "-f", "concat",
            "-safe", "0",
            "-i", concat_list_path,
            "-c:a", "libmp3lame",
            "-q:a", "4",
            "-y",
            out_path,
        ]
        subprocess.run(cmd, check=True)
    finally:
        if concat_list_path is not None:
            # Best-effort cleanup: a leftover temp file must not mask the
            # real error (if any) raised above.
            try:
                os.remove(concat_list_path)
            except OSError:
                pass


def main() -> int:
    """Run the extractor from the command line.

    Copies every media file referenced by a [sound:...] tag in the decks
    mapped to the chosen language, writes a playlist, and optionally
    concatenates the audio.

    Returns:
        Process exit code: 0 on success (including "no notes found"),
        1 when no deck maps to the language or concatenation fails.
    """
    parser = argparse.ArgumentParser(
        description="Extract Anki audio by language."
    )

    # REQUIRED positional language code: jp / es
    parser.add_argument(
        "lang",
        choices=sorted(LANG_MAP.keys()),
        help="Language code (jp or es).",
    )

    # Match bash-style flags
    parser.add_argument(
        "--concat",
        action="store_true",
        help="Also output a single concatenated MP3 file (in playlist order).",
    )
    parser.add_argument(
        "--outdir",
        help="Output directory. Default: ~/Languages/Anki/anki-audio/<language>",
    )
    parser.add_argument(
        "--media-dir",
        default=DEFAULT_ANKI_MEDIA_DIR,
        help="Anki collection.media directory. Defaults to the common Flatpak profile path.",
    )

    # Lets repeated runs leave already-exported files alone
    parser.add_argument(
        "--copy-only-new",
        action="store_true",
        help="Skip overwriting existing files.",
    )

    args = parser.parse_args()

    language = LANG_MAP[args.lang]
    media_dir = os.path.expanduser(args.media_dir)

    # Find all decks whose mapped language matches
    selected_decks = [deck for deck, lang in DECK_TO_LANGUAGE.items() if lang == language]
    if not selected_decks:
        print(f"No decks found for language: {language}", file=sys.stderr)
        return 1

    # Output folder: either user-specified --outdir or <output root>/<language>
    if args.outdir:
        out_dir = os.path.expanduser(args.outdir)
    else:
        out_dir = os.path.join(DEFAULT_AUDIO_OUTPUT_ROOT, language)
    os.makedirs(out_dir, exist_ok=True)

    # Collect note IDs across selected decks
    all_ids: List[int] = []
    for deck in selected_decks:
        ids = anki_request("findNotes", query=f'deck:"{deck}"')
        all_ids.extend(ids)

    # The same note can be returned for more than one selected deck; keep
    # the first occurrence of each ID, preserving order.
    all_ids = list(dict.fromkeys(all_ids))

    if not all_ids:
        print(f"No notes found in decks for language: {language}")
        return 0

    # Fetch notes info (fields contain [sound:...] references)
    notes = anki_request("notesInfo", notes=all_ids)

    # Copy referenced audio files into out_dir
    copied: List[str] = []
    seen: set[str] = set()
    for note in notes:
        fields = note.get("fields", {})
        for field in fields.values():
            val = field.get("value", "") or ""
            for match in _SOUND_TAG_RE.findall(val):
                # A sound shared by several notes/fields is handled once, so
                # it is neither recopied nor counted twice.
                if match in seen:
                    continue
                seen.add(match)

                paths = resolve_media_paths(media_dir, out_dir, match)
                if paths is None:
                    print(f"Skipping unsafe media reference: {match}", file=sys.stderr)
                    continue
                src, dst = paths

                # Only regular files can be copied; this also skips missing
                # media and references that resolve to a directory.
                if not os.path.isfile(src):
                    continue

                if args.copy_only_new and os.path.exists(dst):
                    continue

                # If Anki stored media in subfolders, ensure the subfolder
                # exists in out_dir
                dst_parent = os.path.dirname(dst)
                if dst_parent:
                    os.makedirs(dst_parent, exist_ok=True)

                shutil.copy2(src, dst)
                copied.append(match)

    # Create playlist, including audio in subfolders.
    m3u_path = build_playlist(out_dir, language)

    print(f"\n✅ Copied {len(copied)} files for {language}")
    print(f"🎵 Playlist created at: {m3u_path}")
    print(f"📁 Output directory: {out_dir}")

    # Optional: concatenate all audio into one MP3 (order = playlist order)
    if args.concat:
        concat_out = os.path.join(out_dir, f"{language}_concat.mp3")
        try:
            concat_audio_from_m3u(out_dir, m3u_path, concat_out)
            print(f"🎧 Concatenated file created at: {concat_out}")
        except Exception as e:
            # Top-level boundary: report any failure and exit non-zero.
            print(f"❌ Concatenation failed: {e}", file=sys.stderr)
            return 1

    return 0


if __name__ == "__main__":
    raise SystemExit(main())