Rename project to Saiki and unify CLI

This commit is contained in:
Pawel
2026-05-26 18:09:26 -04:00
parent 8ee1f8de25
commit f38030238c
19 changed files with 1274 additions and 1326 deletions

29
saiki/text.py Normal file
View File

@@ -0,0 +1,29 @@
"""Text cleanup helpers shared by tools."""
from __future__ import annotations
from html import unescape
import regex as re
def extract_first_visible_line(text: str) -> str:
    """Return the first visible line of an HTML fragment.

    ``<br>``, ``<div>`` and ``<p>`` tags (opening or closing) are treated as
    line breaks, every other tag is removed, and HTML character references
    are decoded.

    Args:
        text: HTML fragment. ``None`` or ``""`` is treated as empty input.

    Returns:
        The first line that still contains visible characters, without
        surrounding whitespace, or ``""`` when nothing visible remains.
    """
    # Remove markup while entities are still encoded. Decoding first would
    # turn an escaped "&lt;" / "&gt;" in the visible text into real angle
    # brackets that the tag patterns below would then delete.
    text = re.sub(
        r"</?(br|div|p)[^>]*>", "\n", text or "", flags=re.IGNORECASE
    )
    text = re.sub(r"<[^>]+>", "", text)
    text = unescape(text).strip()
    if not text:
        return ""
    # The strip above guarantees the first line is non-empty; strip the line
    # itself so whitespace that preceded a line break does not leak out.
    return text.splitlines()[0].strip()
def extract_visible_text(text: str) -> str:
    """Return the visible text of an HTML fragment, one entry per line.

    ``<br>``, ``<div>`` and ``<p>`` tags (opening or closing) are treated as
    line breaks, every other tag is removed, and HTML character references
    are decoded. Runs of spaces/tabs collapse to a single space, each line is
    trimmed, and empty lines are dropped.

    Args:
        text: HTML fragment. ``None`` or ``""`` is treated as empty input.

    Returns:
        The cleaned text with lines joined by ``"\\n"``, or ``""`` when
        nothing visible remains.
    """
    # Remove markup while entities are still encoded. Decoding first would
    # turn an escaped "&lt;" / "&gt;" in the visible text into real angle
    # brackets that the tag patterns below would then delete.
    text = re.sub(
        r"</?(br|div|p)[^>]*>", "\n", text or "", flags=re.IGNORECASE
    )
    text = re.sub(r"<[^>]+>", "", text)
    text = unescape(text)
    # Normalise line by line so that whitespace-only lines (e.g. the space
    # between two adjacent <div> blocks) and "\r\n" pairs do not survive as
    # blank or padded lines.
    cleaned = (
        re.sub(r"[ \t]+", " ", line).strip() for line in text.splitlines()
    )
    return "\n".join(line for line in cleaned if line)
def normalize_word_key(value: str) -> str:
    """Return *value* in a canonical form for comparing words.

    The value is trimmed, lower-cased, and every run of whitespace is
    collapsed to a single space.

    Args:
        value: Word or phrase to normalise. ``None`` or ``""`` yields ``""``,
            matching how the other helpers in this module treat empty input.

    Returns:
        The normalised key.
    """
    return re.sub(r"\s+", " ", (value or "").strip().lower())