Rename project to Saiki and unify CLI
This commit is contained in:
179
saiki/youtube.py
Normal file
179
saiki/youtube.py
Normal file
@@ -0,0 +1,179 @@
|
||||
"""YouTube transcript mining and Anki-ready exports."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import csv
|
||||
import os
|
||||
import re
|
||||
from collections import Counter
|
||||
from dataclasses import dataclass
|
||||
from urllib.parse import parse_qs, urlparse
|
||||
|
||||
from youtube_transcript_api import YouTubeTranscriptApi
|
||||
|
||||
from .config import Config
|
||||
from .text import normalize_word_key
|
||||
from .words import read_word_file
|
||||
|
||||
STOPWORDS = {
|
||||
"es": {
|
||||
"de", "la", "que", "el", "en", "y", "a", "los", "del", "se", "las", "por",
|
||||
"un", "para", "con", "no", "una", "su", "al", "lo", "como",
|
||||
},
|
||||
"en": {"the", "is", "and", "of", "to", "in", "it", "that", "on", "you", "this", "for", "with"},
|
||||
"ja": {"の", "に", "は", "を", "た", "が", "で", "て", "です", "ます", "する", "ある", "いる"},
|
||||
}
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class TranscriptLine:
|
||||
start: float
|
||||
text: str
|
||||
|
||||
|
||||
def extract_video_id(url_or_id: str) -> str:
|
||||
if "youtube" in url_or_id or "youtu.be" in url_or_id:
|
||||
query = urlparse(url_or_id)
|
||||
if query.hostname == "youtu.be":
|
||||
return query.path.lstrip("/")
|
||||
if query.hostname in ("www.youtube.com", "youtube.com", "m.youtube.com"):
|
||||
values = parse_qs(query.query).get("v", [])
|
||||
if values:
|
||||
return values[0]
|
||||
return url_or_id
|
||||
|
||||
|
||||
def video_url(video_or_id: str) -> str:
|
||||
video_id = extract_video_id(video_or_id)
|
||||
return f"https://www.youtube.com/watch?v={video_id}"
|
||||
|
||||
|
||||
def fetch_transcript(video_id: str, lang_code: str):
|
||||
if hasattr(YouTubeTranscriptApi, "fetch"):
|
||||
api = YouTubeTranscriptApi()
|
||||
return api.fetch(video_id, languages=[lang_code])
|
||||
if hasattr(YouTubeTranscriptApi, "get_transcript"):
|
||||
return YouTubeTranscriptApi.get_transcript(video_id, languages=[lang_code])
|
||||
raise RuntimeError("Unsupported youtube-transcript-api version.")
|
||||
|
||||
|
||||
def snippet_text(entry) -> str:
|
||||
if isinstance(entry, dict):
|
||||
return entry.get("text", "") or ""
|
||||
return getattr(entry, "text", "") or ""
|
||||
|
||||
|
||||
def snippet_start(entry) -> float:
|
||||
if isinstance(entry, dict):
|
||||
return float(entry.get("start", 0.0) or 0.0)
|
||||
return float(getattr(entry, "start", 0.0) or 0.0)
|
||||
|
||||
|
||||
def transcript_lines(entries) -> list[TranscriptLine]:
|
||||
lines: list[TranscriptLine] = []
|
||||
for entry in entries:
|
||||
text = snippet_text(entry).replace("\n", " ").strip()
|
||||
if text:
|
||||
lines.append(TranscriptLine(snippet_start(entry), text))
|
||||
return lines
|
||||
|
||||
|
||||
def tokenize_japanese(text: str) -> list[str]:
|
||||
try:
|
||||
from fugashi import Tagger
|
||||
except ImportError as e:
|
||||
raise RuntimeError('Japanese requires fugashi. Install: pip install "fugashi[unidic-lite]"') from e
|
||||
tagger = Tagger()
|
||||
return [w.surface for w in tagger(text)]
|
||||
|
||||
|
||||
def tokenize_spanish(text: str, raw: bool = False) -> list[str]:
|
||||
tokens = re.findall(r"\b[\wáéíóúñü]+\b", text)
|
||||
return tokens if raw else [t.lower() for t in tokens]
|
||||
|
||||
|
||||
def tokenize_text(text: str, lang_code: str, raw: bool = False) -> list[str]:
|
||||
return tokenize_japanese(text) if lang_code == "ja" else tokenize_spanish(text, raw=raw)
|
||||
|
||||
|
||||
def count_words(tokens: list[str], lang_code: str, remove_stopwords: bool = True) -> Counter:
|
||||
if remove_stopwords:
|
||||
stopwords = STOPWORDS.get(lang_code, set())
|
||||
tokens = [t for t in tokens if t not in stopwords]
|
||||
return Counter(tokens)
|
||||
|
||||
|
||||
def sentence_vocab(sentence: str, lang_code: str, known_words: set[str] | None = None) -> list[str]:
|
||||
words: list[str] = []
|
||||
seen: set[str] = set()
|
||||
for token in tokenize_text(sentence, lang_code):
|
||||
key = normalize_word_key(token)
|
||||
if key in seen or key in STOPWORDS.get(lang_code, set()):
|
||||
continue
|
||||
if known_words is not None and key in known_words:
|
||||
continue
|
||||
seen.add(key)
|
||||
words.append(token)
|
||||
return words
|
||||
|
||||
|
||||
def write_sentence_export(
|
||||
lines: list[TranscriptLine],
|
||||
out_path: str,
|
||||
video: str,
|
||||
lang_code: str,
|
||||
delimiter: str = "\t",
|
||||
known_words_path: str | None = None,
|
||||
only_new: bool = False,
|
||||
) -> int:
|
||||
known = read_word_file(known_words_path) if known_words_path else None
|
||||
os.makedirs(os.path.dirname(os.path.abspath(out_path)), exist_ok=True)
|
||||
written = 0
|
||||
with open(out_path, "w", encoding="utf-8", newline="") as fh:
|
||||
writer = csv.writer(fh, delimiter=delimiter)
|
||||
writer.writerow(["sentence", "timestamp", "video_url", "vocab_guess"])
|
||||
for line in lines:
|
||||
vocab = sentence_vocab(line.text, lang_code, known)
|
||||
if only_new and not vocab:
|
||||
continue
|
||||
writer.writerow([line.text, f"{line.start:.2f}", video_url(video), ", ".join(vocab)])
|
||||
written += 1
|
||||
return written
|
||||
|
||||
|
||||
def run_youtube(
|
||||
config: Config,
|
||||
lang: str,
|
||||
video: str,
|
||||
mode: str = "vocab",
|
||||
top: int | None = None,
|
||||
no_stopwords: bool = False,
|
||||
raw: bool = False,
|
||||
out: str | None = None,
|
||||
fmt: str = "tsv",
|
||||
known_words: str | None = None,
|
||||
only_new: bool = False,
|
||||
) -> dict[str, object]:
|
||||
lang_code = config.transcript_code(lang)
|
||||
video_id = extract_video_id(video)
|
||||
entries = fetch_transcript(video_id, lang_code)
|
||||
lines = transcript_lines(entries)
|
||||
|
||||
if mode == "sentences":
|
||||
if out:
|
||||
delimiter = "," if fmt == "csv" else "\t"
|
||||
written = write_sentence_export(lines, out, video_id, lang_code, delimiter, known_words, only_new)
|
||||
return {"mode": mode, "lines": len(lines), "written": written, "out": out}
|
||||
return {"mode": mode, "lines": lines}
|
||||
|
||||
text = " ".join(line.text for line in lines)
|
||||
tokens = tokenize_text(text, lang_code, raw=raw)
|
||||
counts = count_words(tokens, lang_code, remove_stopwords=not no_stopwords)
|
||||
items = counts.most_common(top) if top else counts.most_common()
|
||||
if out:
|
||||
os.makedirs(os.path.dirname(os.path.abspath(out)), exist_ok=True)
|
||||
with open(out, "w", encoding="utf-8") as fh:
|
||||
for word, count in items:
|
||||
fh.write(f"{word} {count}\n")
|
||||
return {"mode": mode, "items": items, "out": out}
|
||||
|
||||
Reference in New Issue
Block a user