Files
Saiki/saiki/text.py
2026-05-26 18:09:26 -04:00

30 lines
796 B
Python

"""Text cleanup helpers shared by tools."""
from __future__ import annotations
from html import unescape
import regex as re
def extract_first_visible_line(text: str) -> str:
    """Return the first line of *text* that remains after removing HTML markup.

    Opening or closing ``<br>``, ``<div>`` and ``<p>`` tags are turned into
    line breaks, every other tag is dropped, and character references
    (``&amp;``, ``&lt;`` ...) are decoded. Surrounding whitespace is removed
    from the returned line. A falsy *text* (``""`` or ``None``) or input with
    no visible characters yields ``""``.
    """
    text = text or ""
    # Remove markup BEFORE decoding entities: an escaped literal such as
    # "&lt;br&gt;" is visible text and must not be mistaken for a real tag.
    text = re.sub(r"</?(br|div|p)[^>]*>", "\n", text, flags=re.IGNORECASE)
    text = re.sub(r"<[^>]+>", "", text)
    text = unescape(text).strip()
    if not text:
        return ""
    # strip() already removed leading whitespace, so the first line starts
    # with a visible character; it may still end with spaces that preceded
    # the line break.
    return text.splitlines()[0].rstrip()
def extract_visible_text(text: str) -> str:
    """Return *text* with HTML markup removed and whitespace tidied.

    Opening or closing ``<br>``, ``<div>`` and ``<p>`` tags become line
    breaks, every other tag is dropped, and character references are
    decoded. Runs of spaces/tabs collapse to one space, spaces next to a
    line break are removed, and consecutive line breaks collapse to one.
    A falsy *text* (``""`` or ``None``) yields ``""``.
    """
    text = text or ""
    # Remove markup BEFORE decoding entities: an escaped literal such as
    # "&lt;b&gt;" is visible text and must not be mistaken for a real tag.
    text = re.sub(r"</?(br|div|p)[^>]*>", "\n", text, flags=re.IGNORECASE)
    text = re.sub(r"<[^>]+>", "", text)
    text = unescape(text)
    text = re.sub(r"[ \t]+", " ", text)
    # After the collapse above at most one space can sit on either side of a
    # line break. Dropping it keeps whitespace-only lines from surviving the
    # blank-line collapse below (e.g. "<div>a</div> <div>b</div>").
    text = re.sub(r" ?\n ?", "\n", text)
    text = re.sub(r"\n{2,}", "\n", text)
    return text.strip()
def normalize_word_key(value: str) -> str:
    """Return *value* trimmed, lower-cased, with whitespace runs as one space."""
    lowered = value.strip().lower()
    return re.sub(r"\s+", " ", lowered)