Added & fixed some documentation
This commit is contained in:
@@ -1,3 +1,4 @@
|
||||
"""PyTorch dataset adapters for tokenized climbing-board routes."""
|
||||
from __future__ import annotations
|
||||
|
||||
import torch
|
||||
@@ -5,7 +6,15 @@ from torch.utils.data import Dataset
|
||||
|
||||
|
||||
class RouteGradeDataset(Dataset):
|
||||
"""Dataset for transformer encoder grade prediction.
|
||||
|
||||
Each item returns a padded token sequence, a boolean attention mask, the
|
||||
continuous display-difficulty target, and a small amount of route identity
|
||||
metadata used when writing prediction CSVs.
|
||||
"""
|
||||
|
||||
def __init__(self, df, max_len: int, pad_id: int):
|
||||
"""Store model IDs and labels from a tokenized route DataFrame."""
|
||||
self.row_ids = df["row_id"].tolist() if "row_id" in df.columns else df.index.tolist()
|
||||
self.ids = df["model_ids"].tolist()
|
||||
self.targets = df["display_difficulty"].astype(float).values
|
||||
@@ -15,9 +24,11 @@ class RouteGradeDataset(Dataset):
|
||||
self.pad_id = int(pad_id)
|
||||
|
||||
def __len__(self) -> int:
|
||||
"""Return the number of route examples."""
|
||||
return len(self.ids)
|
||||
|
||||
def __getitem__(self, idx: int):
|
||||
"""Return one padded encoder example and its regression target."""
|
||||
ids = list(self.ids[idx])[: self.max_len]
|
||||
mask = [1] * len(ids)
|
||||
if len(ids) < self.max_len:
|
||||
@@ -36,15 +47,25 @@ class RouteGradeDataset(Dataset):
|
||||
|
||||
|
||||
class RouteGPTDataset(Dataset):
|
||||
"""Dataset for causal next-token route generation.
|
||||
|
||||
The full sequence is padded once, then split into ``input_ids`` and
|
||||
``target_ids`` shifted by one position for teacher-forced language-model
|
||||
training.
|
||||
"""
|
||||
|
||||
def __init__(self, df, max_len: int, pad_id: int):
|
||||
"""Store GPT token ID sequences from a tokenized route DataFrame."""
|
||||
self.ids = df["gpt_ids"].tolist()
|
||||
self.max_len = int(max_len)
|
||||
self.pad_id = int(pad_id)
|
||||
|
||||
def __len__(self) -> int:
|
||||
"""Return the number of route examples."""
|
||||
return len(self.ids)
|
||||
|
||||
def __getitem__(self, idx: int):
|
||||
"""Return one padded causal-language-model training example."""
|
||||
ids = list(self.ids[idx])[: self.max_len]
|
||||
if len(ids) < self.max_len:
|
||||
ids += [self.pad_id] * (self.max_len - len(ids))
|
||||
|
||||
@@ -1,3 +1,9 @@
|
||||
"""Evaluation utilities for generated climbing-board routes.
|
||||
|
||||
The helpers in this module are intentionally model-agnostic: they work from
|
||||
tokens, frames strings, and token metadata so notebooks, scripts, and tests can
|
||||
reuse the same route validity, novelty, and geometry calculations.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
@@ -11,10 +17,12 @@ from .tokenization import parse_tokens, tokens_to_hold_records
|
||||
|
||||
|
||||
def parse_token_list(value) -> list[str]:
    """Parse ``value`` into a token list by delegating to ``parse_tokens``.

    Kept as a thin alias so callers of this module keep a stable name.
    """
    tokens = parse_tokens(value)
    return tokens
|
||||
|
||||
|
||||
def validity_from_records(records: list[dict[str, object]], requested_board_prefix: str | None = None) -> dict[str, object]:
|
||||
"""Compute evaluation-specific route-validity flags from hold records."""
|
||||
placements = [int(record["placement_id"]) for record in records]
|
||||
roles = [str(record["role"]) for record in records]
|
||||
prefixes = [str(record["board_token_prefix"]) for record in records]
|
||||
@@ -51,16 +59,19 @@ def validity_from_records(records: list[dict[str, object]], requested_board_pref
|
||||
|
||||
|
||||
def frames_to_holds(frames: str | None) -> list[tuple[int, int]]:
    """Extract ``(placement_id, role_id)`` integer pairs from a frames string.

    Every ``p<digits>r<digits>`` occurrence contributes one pair, in order of
    appearance. Any non-string input (for example ``None``) yields ``[]``.
    """
    if isinstance(frames, str):
        pairs = []
        for match in re.finditer(r"p(\d+)r(\d+)", frames):
            placement, role = match.groups()
            pairs.append((int(placement), int(role)))
        return pairs
    return []
|
||||
|
||||
|
||||
def holds_to_placement_set(holds: Iterable[tuple[int, int]]) -> frozenset[int]:
    """Collapse ``(placement_id, role_id)`` pairs to a frozenset of placement IDs.

    Role IDs are discarded and repeated placements collapse to one entry.
    """
    placement_ids = set()
    for placement_id, _role_id in holds:
        placement_ids.add(int(placement_id))
    return frozenset(placement_ids)
|
||||
|
||||
|
||||
def jaccard(a: frozenset[int], b: frozenset[int]) -> float:
|
||||
"""Return Jaccard similarity between two placement sets."""
|
||||
if not a and not b:
|
||||
return 1.0
|
||||
if not a or not b:
|
||||
@@ -73,6 +84,7 @@ def nearest_real_route_same_board(
|
||||
generated_board_key: str,
|
||||
real_df: pd.DataFrame,
|
||||
) -> dict[str, object]:
|
||||
"""Find the most similar real route on the same board by Jaccard score."""
|
||||
board_frame = real_df[real_df["board_key"] == generated_board_key]
|
||||
if board_frame.empty:
|
||||
return {
|
||||
@@ -100,6 +112,7 @@ def nearest_real_route_same_board(
|
||||
|
||||
|
||||
def build_placement_coords(df_token_meta: pd.DataFrame) -> dict[tuple[str, int], dict[str, float]]:
|
||||
"""Build a placement-coordinate lookup from token metadata."""
|
||||
hold_meta = df_token_meta[df_token_meta["kind"] == "hold"].dropna(subset=["placement_id"]).copy()
|
||||
coords = {}
|
||||
for _, row in hold_meta.drop_duplicates(["board_key", "placement_id"]).iterrows():
|
||||
@@ -116,6 +129,12 @@ def simple_route_features(
|
||||
records: list[dict[str, object]],
|
||||
placement_coords: dict[tuple[str, int], dict[str, float]],
|
||||
) -> dict[str, float]:
|
||||
"""Compute simple geometric route features from hold coordinates.
|
||||
|
||||
These features are descriptive rather than a full climbing-physics model:
|
||||
height/width describe route spread, and hand-reach distances summarize the
|
||||
pairwise spacing among start/middle/finish holds.
|
||||
"""
|
||||
rows = []
|
||||
for record in records:
|
||||
key = (str(board_key), int(record["placement_id"]))
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
"""Sampling and structural-validity helpers for route generation."""
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Iterable
|
||||
@@ -9,6 +10,7 @@ from .tokenization import tokens_to_hold_records
|
||||
|
||||
|
||||
def top_k_filter(logits: torch.Tensor, k: int | None) -> torch.Tensor:
|
||||
"""Mask logits outside the top ``k`` choices for each batch row."""
|
||||
if k is None or k <= 0 or k >= logits.size(-1):
|
||||
return logits
|
||||
values, _ = torch.topk(logits, k)
|
||||
@@ -27,6 +29,11 @@ def sample_ids(
|
||||
eos_id: int | None = None,
|
||||
forbidden_ids: Iterable[int] | None = None,
|
||||
) -> list[int]:
|
||||
"""Autoregressively sample token IDs from a trained route generator.
|
||||
|
||||
The returned list includes the prompt IDs and all sampled IDs up to either
|
||||
``max_new_tokens`` or the first sampled ``eos_id``.
|
||||
"""
|
||||
model.eval()
|
||||
sequence = torch.tensor([prompt_ids], dtype=torch.long, device=device)
|
||||
forbidden_ids = set(forbidden_ids or [])
|
||||
@@ -36,6 +43,8 @@ def sample_ids(
|
||||
logits, _ = model(idx_cond)
|
||||
logits = logits[:, -1, :] / max(temperature, 1e-6)
|
||||
|
||||
# Special tokens like <PAD> and <CLS> are valid vocabulary entries but
|
||||
# should never be emitted in the middle of a generated climb.
|
||||
for token_id in forbidden_ids:
|
||||
logits[:, int(token_id)] = -float("inf")
|
||||
|
||||
@@ -51,6 +60,7 @@ def sample_ids(
|
||||
|
||||
|
||||
def prompt_tokens(board_prefix: str, angle: int, grouped_v: int) -> list[str]:
|
||||
"""Build the conditioning prefix used before sampling hold tokens."""
|
||||
return [
|
||||
"<BOS>",
|
||||
f"<BOARD_{board_prefix}>",
|
||||
@@ -60,10 +70,12 @@ def prompt_tokens(board_prefix: str, angle: int, grouped_v: int) -> list[str]:
|
||||
|
||||
|
||||
def hold_records(tokens: Iterable[str]) -> list[dict[str, object]]:
    """Return whatever ``tokens_to_hold_records`` extracts from ``tokens``."""
    records = tokens_to_hold_records(tokens)
    return records
|
||||
|
||||
|
||||
def validity_summary(tokens: Iterable[str], requested_board_prefix: str | None = None) -> dict[str, object]:
|
||||
"""Summarize basic structural validity for generated token sequences."""
|
||||
records = hold_records(tokens)
|
||||
placements = [record["placement_id"] for record in records]
|
||||
roles = [record["role"] for record in records]
|
||||
@@ -94,6 +106,11 @@ def validity_summary(tokens: Iterable[str], requested_board_prefix: str | None =
|
||||
|
||||
|
||||
def generated_tokens_to_frames(tokens: Iterable[str], role_name_to_id: dict[str, int], board_prefix: str | None = None) -> str:
|
||||
"""Convert generated hold tokens back into a frames string.
|
||||
|
||||
Duplicate placements and unknown roles are skipped, matching the forgiving
|
||||
cleanup used by the demo scripts and webapp.
|
||||
"""
|
||||
pieces = []
|
||||
seen = set()
|
||||
for record in hold_records(tokens):
|
||||
@@ -121,6 +138,7 @@ def generate_one(
|
||||
top_k: int | None = 50,
|
||||
max_new_tokens: int = 40,
|
||||
) -> dict[str, object]:
|
||||
"""Generate one route and return tokens, frames, request metadata, validity."""
|
||||
unk_id = stoi["<UNK>"]
|
||||
eos_id = stoi["<EOS>"]
|
||||
forbidden_ids = [
|
||||
|
||||
@@ -1,5 +1,8 @@
|
||||
"""Grade-scale helpers for BoardLib display difficulty and grouped V grades."""
|
||||
from __future__ import annotations
|
||||
|
||||
# BoardLib display difficulties are integer-like values. This project groups
|
||||
# them into V grades so TB2 and Kilter can share a compact grade-token space.
|
||||
GRADE_TO_V = {
|
||||
10: 0, 11: 0, 12: 0,
|
||||
13: 1, 14: 1,
|
||||
@@ -22,10 +25,12 @@ GRADE_TO_V = {
|
||||
|
||||
|
||||
def to_grouped_v(display_difficulty: float) -> int:
    """Look up the grouped V grade for a display difficulty.

    The value is rounded to an integer with the built-in ``round`` and then
    clamped to the smallest/largest key of ``GRADE_TO_V``, so out-of-range
    difficulties resolve to the nearest end of the table.
    """
    nearest = int(round(float(display_difficulty)))
    highest = max(GRADE_TO_V)
    lowest = min(GRADE_TO_V)
    # Clamp from above first, then from below, before indexing the table.
    if nearest > highest:
        nearest = highest
    if nearest < lowest:
        nearest = lowest
    return GRADE_TO_V[nearest]
|
||||
|
||||
|
||||
def grade_token(display_difficulty: float) -> str:
    """Format the grouped V grade of ``display_difficulty`` as ``<GRADE_V{n}>``."""
    grouped_v = to_grouped_v(display_difficulty)
    return f"<GRADE_V{grouped_v}>"
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
"""Metrics used to evaluate continuous grade predictions."""
|
||||
from __future__ import annotations
|
||||
|
||||
import math
|
||||
@@ -10,6 +11,7 @@ from .grades import to_grouped_v
|
||||
|
||||
|
||||
def regression_metrics(y_true, y_pred) -> dict[str, float]:
|
||||
"""Compute difficulty-scale and grouped-V-grade prediction metrics."""
|
||||
y_true = np.asarray(y_true)
|
||||
y_pred = np.asarray(y_pred)
|
||||
true_v = np.asarray([to_grouped_v(x) for x in y_true])
|
||||
@@ -28,6 +30,7 @@ def regression_metrics(y_true, y_pred) -> dict[str, float]:
|
||||
|
||||
|
||||
def metrics_by_board(pred_df: pd.DataFrame) -> pd.DataFrame:
|
||||
"""Compute regression metrics separately for each board in a prediction table."""
|
||||
rows = []
|
||||
for board_key, frame in pred_df.groupby("board_key"):
|
||||
metrics = regression_metrics(frame["y_true"].values, frame["y_pred"].values)
|
||||
@@ -36,6 +39,7 @@ def metrics_by_board(pred_df: pd.DataFrame) -> pd.DataFrame:
|
||||
|
||||
|
||||
def print_metrics(name: str, metrics: dict[str, float]) -> None:
|
||||
"""Pretty-print a metric dictionary in the training scripts."""
|
||||
print(name)
|
||||
print("-" * len(name))
|
||||
for key, value in metrics.items():
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
"""Neural network definitions for grade prediction and route generation."""
|
||||
from __future__ import annotations
|
||||
|
||||
import torch
|
||||
@@ -6,7 +7,13 @@ import torch.nn.functional as F
|
||||
|
||||
|
||||
class JointRouteTransformerRegressor(nn.Module):
|
||||
"""Transformer encoder for joint TB2/Kilter route difficulty prediction."""
|
||||
"""Transformer encoder for joint TB2/Kilter route difficulty prediction.
|
||||
|
||||
Inputs are token IDs plus an attention mask. Token, position, and learned
|
||||
projections of coordinate metadata are added before the encoder. The first
|
||||
``<CLS>`` position is then used as a pooled route representation for scalar
|
||||
difficulty regression.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -20,6 +27,7 @@ class JointRouteTransformerRegressor(nn.Module):
|
||||
dropout: float = 0.10,
|
||||
pad_id: int = 0,
|
||||
):
|
||||
"""Create the encoder, coordinate projection, and regression head."""
|
||||
super().__init__()
|
||||
self.vocab_size = vocab_size
|
||||
self.max_len = max_len
|
||||
@@ -55,9 +63,12 @@ class JointRouteTransformerRegressor(nn.Module):
|
||||
)
|
||||
|
||||
def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
|
||||
"""Return one continuous difficulty prediction per input sequence."""
|
||||
batch_size, seq_len = input_ids.shape
|
||||
positions = torch.arange(seq_len, device=input_ids.device).unsqueeze(0).expand(batch_size, seq_len)
|
||||
|
||||
# Coordinate features are indexed by token ID, so every occurrence of a
|
||||
# hold token gets the same physical x/y hint wherever it appears.
|
||||
x = self.token_emb(input_ids) + self.pos_emb(positions)
|
||||
x = x + self.coord_proj(self.coord_features[input_ids])
|
||||
|
||||
@@ -70,7 +81,11 @@ class JointRouteTransformerRegressor(nn.Module):
|
||||
|
||||
|
||||
class JointRouteGPT(nn.Module):
|
||||
"""Tiny GPT-style causal transformer for board-conditioned route generation."""
|
||||
"""Tiny GPT-style causal transformer for board-conditioned route generation.
|
||||
|
||||
PyTorch's ``TransformerEncoder`` is used with a causal mask, which makes it
|
||||
behave like a decoder-only language model for short route sequences.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -82,6 +97,7 @@ class JointRouteGPT(nn.Module):
|
||||
dropout: float = 0.10,
|
||||
pad_id: int = 0,
|
||||
):
|
||||
"""Create the token/position embeddings, causal blocks, and LM head."""
|
||||
super().__init__()
|
||||
self.vocab_size = vocab_size
|
||||
self.block_size = block_size
|
||||
@@ -114,6 +130,7 @@ class JointRouteGPT(nn.Module):
|
||||
idx: torch.Tensor,
|
||||
targets: torch.Tensor | None = None,
|
||||
) -> tuple[torch.Tensor, torch.Tensor | None]:
|
||||
"""Return next-token logits and, when targets are supplied, CE loss."""
|
||||
_, seq_len = idx.shape
|
||||
if seq_len > self.block_size:
|
||||
idx = idx[:, -self.block_size :]
|
||||
@@ -126,6 +143,8 @@ class JointRouteGPT(nn.Module):
|
||||
torch.ones(seq_len, seq_len, device=idx.device, dtype=torch.bool),
|
||||
diagonal=1,
|
||||
)
|
||||
# Padding masks suppress attention to right-padded context tokens while
|
||||
# the causal mask suppresses attention to future positions.
|
||||
key_padding_mask = idx.eq(self.pad_id)
|
||||
|
||||
h = self.blocks(
|
||||
|
||||
@@ -1,9 +1,16 @@
|
||||
"""Path discovery helpers for scripts that can be launched from any directory."""
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def find_project_root(start: str | Path | None = None) -> Path:
|
||||
"""Walk upward until the repository root markers are found.
|
||||
|
||||
The project root is identified by both ``pyproject.toml`` and ``configs``.
|
||||
If no directory has both markers, the resolved starting directory is returned
|
||||
so callers still have a deterministic base path.
|
||||
"""
|
||||
current = Path(start).resolve() if start is not None else Path.cwd().resolve()
|
||||
for candidate in [current, *current.parents]:
|
||||
if (candidate / "pyproject.toml").exists() and (candidate / "configs").exists():
|
||||
|
||||
@@ -1,3 +1,10 @@
|
||||
"""Route tokenization helpers shared by training, evaluation, and demos.
|
||||
|
||||
The project represents every climb as a short symbolic sequence. Board,
|
||||
angle, grade, and hold-role information are all encoded as tokens, while hold
|
||||
tokens are namespaced by board so placement IDs from different products cannot
|
||||
collide.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
@@ -19,6 +26,8 @@ SPECIAL_TOKENS = [
|
||||
"<MASK>",
|
||||
]
|
||||
|
||||
# The token grammar is intentionally centralized here so training, generation,
|
||||
# evaluation, and the webapp parse the same strings in the same way.
|
||||
ANGLE_TOKEN_PATTERN = re.compile(r"^<ANGLE_(-?\d+)>$")
|
||||
GRADE_TOKEN_PATTERN = re.compile(r"^<GRADE_V(\d+)>$")
|
||||
BOARD_TOKEN_PATTERN = re.compile(r"^<BOARD_([A-Z0-9_]+)>$")
|
||||
@@ -34,6 +43,12 @@ ROLE_SORT_ORDER = {
|
||||
|
||||
|
||||
def parse_frames(frames_str: str | None) -> list[tuple[int, int]]:
|
||||
"""Parse a frames string into ``(placement_id, role_id)`` pairs.
|
||||
|
||||
Frames strings are compact concatenations such as ``p344r5p369r6``. Invalid
|
||||
or missing input returns an empty list so callers can skip unusable climbs
|
||||
without special-case exception handling.
|
||||
"""
|
||||
if not isinstance(frames_str, str):
|
||||
return []
|
||||
matches = re.findall(r"p(\d+)r(\d+)", frames_str)
|
||||
@@ -78,6 +93,7 @@ def tokens_to_hold_records(tokens: Iterable[str]) -> list[dict[str, object]]:
|
||||
|
||||
|
||||
def make_placement_lookup(df_placements: pd.DataFrame) -> dict[tuple[str, int], dict]:
|
||||
"""Build a coordinate/metadata lookup keyed by ``(board_key, placement_id)``."""
|
||||
rows = {}
|
||||
for _, row in df_placements.iterrows():
|
||||
key = (str(row["board_key"]), int(row["placement_id"]))
|
||||
@@ -86,6 +102,7 @@ def make_placement_lookup(df_placements: pd.DataFrame) -> dict[tuple[str, int],
|
||||
|
||||
|
||||
def role_name(role_id: int, config: BoardConfig) -> str:
    """Translate a numeric role ID via ``config.role_id_to_name``.

    IDs missing from the mapping are reported as ``"unknown"``.
    """
    lookup = config.role_id_to_name
    role_key = int(role_id)
    return lookup.get(role_key, "unknown")
|
||||
|
||||
|
||||
@@ -94,6 +111,7 @@ def placement_xy(
|
||||
placement_id: int,
|
||||
placement_lookup: dict[tuple[str, int], dict],
|
||||
) -> tuple[float, float]:
|
||||
"""Return raw board coordinates for a placement, or NaNs if unknown."""
|
||||
row = placement_lookup.get((str(board_key), int(placement_id)))
|
||||
if row is None:
|
||||
return (float("nan"), float("nan"))
|
||||
@@ -105,7 +123,15 @@ def canonicalize_holds(
|
||||
config: BoardConfig,
|
||||
placement_lookup: dict[tuple[str, int], dict],
|
||||
) -> list[tuple[int, int]]:
|
||||
"""Sort holds into the canonical route order used by all model inputs.
|
||||
|
||||
Frames preserve setter/storage order, which is not always stable
|
||||
across routes or boards. Canonical ordering keeps starts first, hand/foot
|
||||
holds in a bottom-to-top scan, and finishes last, giving the models a more
|
||||
consistent sequence grammar.
|
||||
"""
|
||||
def key(pair: tuple[int, int]):
|
||||
"""Sort by semantic role, then board position, then placement ID."""
|
||||
placement_id, role_id = pair
|
||||
x, y = placement_xy(config.board_key, placement_id, placement_lookup)
|
||||
name = role_name(role_id, config)
|
||||
@@ -120,10 +146,12 @@ def canonicalize_holds(
|
||||
|
||||
|
||||
def board_token(config: BoardConfig) -> str:
    """Wrap ``config.token_prefix`` in the ``<BOARD_...>`` token format."""
    prefix = config.token_prefix
    return f"<BOARD_{prefix}>"
|
||||
|
||||
|
||||
def angle_token(angle: float) -> str:
    """Format a wall angle as ``<ANGLE_{n}>`` after rounding to an integer.

    Rounding uses the built-in ``round`` on ``float(angle)``.
    """
    whole_degrees = int(round(float(angle)))
    return f"<ANGLE_{whole_degrees}>"
|
||||
|
||||
|
||||
@@ -132,6 +160,7 @@ def hold_token(
|
||||
role_id: int,
|
||||
config: BoardConfig,
|
||||
) -> str:
|
||||
"""Return a board-namespaced hold token for a placement and role."""
|
||||
semantic_role = role_name(role_id, config)
|
||||
return f"<{config.token_prefix}_p{int(placement_id)}_{semantic_role}>"
|
||||
|
||||
@@ -143,6 +172,12 @@ def tokenize_route(
|
||||
include_grade: bool = True,
|
||||
canonical: bool = True,
|
||||
) -> list[str]:
|
||||
"""Tokenize one climb row into the sequence consumed by the models.
|
||||
|
||||
``include_grade=True`` is used for GPT-style generation, where the target
|
||||
grade is a conditioning token. ``include_grade=False`` is used for grade
|
||||
prediction so the model cannot read the answer from its input.
|
||||
"""
|
||||
holds = parse_frames(row["frames"])
|
||||
if canonical:
|
||||
holds = canonicalize_holds(holds, config, placement_lookup)
|
||||
@@ -165,6 +200,12 @@ def build_route_records(
|
||||
configs_by_key: dict[str, BoardConfig],
|
||||
placement_lookup: dict[tuple[str, int], dict],
|
||||
) -> pd.DataFrame:
|
||||
"""Create one training/evaluation record per climb-angle row.
|
||||
|
||||
The returned frame keeps both human-readable route metadata and model-ready
|
||||
token sequences, which lets downstream scripts save compact CSV summaries
|
||||
while still retaining the richer JSONL training artifacts.
|
||||
"""
|
||||
records: list[dict] = []
|
||||
|
||||
for _, row in df_climbs.iterrows():
|
||||
@@ -230,6 +271,7 @@ def build_route_records(
|
||||
|
||||
|
||||
def build_vocab(df_routes: pd.DataFrame) -> tuple[list[str], dict[str, int], dict[int, str]]:
|
||||
"""Build the shared token vocabulary from grade-conditioned sequences."""
|
||||
all_tokens: list[str] = []
|
||||
for tokens in df_routes["tokens_with_grade"]:
|
||||
all_tokens.extend(tokens)
|
||||
@@ -245,11 +287,13 @@ def build_vocab(df_routes: pd.DataFrame) -> tuple[list[str], dict[str, int], dic
|
||||
|
||||
|
||||
def encode(tokens: Iterable[str], stoi: dict[str, int]) -> list[int]:
    """Map each token to its ID in ``stoi``.

    Tokens missing from ``stoi`` map to ``stoi["<UNK>"]``. A ``KeyError`` is
    raised up front when ``stoi`` has no ``"<UNK>"`` entry.
    """
    fallback = stoi["<UNK>"]
    ids = []
    for token in tokens:
        ids.append(stoi.get(token, fallback))
    return ids
|
||||
|
||||
|
||||
def decode(ids: Iterable[int], itos: dict[int, str]) -> list[str]:
    """Map integer IDs to token strings via ``itos``.

    Each ID is coerced with ``int`` before lookup; IDs missing from ``itos``
    decode to the literal ``"<UNK>"``.
    """
    tokens = []
    for raw_id in ids:
        tokens.append(itos.get(int(raw_id), "<UNK>"))
    return tokens
|
||||
|
||||
|
||||
@@ -260,6 +304,12 @@ def build_token_metadata(
|
||||
placement_lookup: dict[tuple[str, int], dict],
|
||||
configs_by_prefix: dict[str, BoardConfig],
|
||||
) -> pd.DataFrame:
|
||||
"""Build per-token metadata used for coordinate features and plotting.
|
||||
|
||||
Hold tokens receive raw coordinates, normalized coordinates in ``[-1, 1]``,
|
||||
role labels, and board identity. Non-hold tokens keep neutral coordinate
|
||||
features so the grade predictor can safely index every token ID.
|
||||
"""
|
||||
bounds = {}
|
||||
for board_key, frame in df_placements.groupby("board_key"):
|
||||
xs = frame["x"].astype(float)
|
||||
@@ -272,6 +322,7 @@ def build_token_metadata(
|
||||
}
|
||||
|
||||
def normalize(value: float, lo: float, hi: float) -> float:
    """Linearly map ``value`` from ``[lo, hi]`` onto ``[-1, 1]``.

    Returns ``0.0`` when ``value`` is missing (per ``pd.isna``) or when the
    range is degenerate (``hi == lo``), which also avoids dividing by zero.
    """
    missing = pd.isna(value)
    if missing or hi == lo:
        return 0.0
    span = hi - lo
    fraction = (float(value) - lo) / span
    return 2 * fraction - 1
|
||||
@@ -353,6 +404,7 @@ def vocab_payload(
|
||||
itos: dict[int, str],
|
||||
configs_by_key: dict[str, BoardConfig],
|
||||
) -> dict:
|
||||
"""Package vocabulary and board metadata for JSON serialization."""
|
||||
return {
|
||||
"stoi": stoi,
|
||||
"itos": {str(k): v for k, v in itos.items()},
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
"""Small shared utilities for reproducibility, JSON output, and data splits."""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
@@ -11,6 +12,7 @@ from sklearn.model_selection import train_test_split
|
||||
|
||||
|
||||
def set_seed(seed: int) -> None:
|
||||
"""Seed Python, NumPy, and PyTorch when PyTorch is installed."""
|
||||
random.seed(seed)
|
||||
np.random.seed(seed)
|
||||
try:
|
||||
@@ -23,6 +25,7 @@ def set_seed(seed: int) -> None:
|
||||
|
||||
|
||||
def json_safe(obj: Any) -> Any:
|
||||
"""Convert NumPy/pandas values into JSON-serializable Python objects."""
|
||||
if isinstance(obj, dict):
|
||||
return {str(k): json_safe(v) for k, v in obj.items()}
|
||||
if isinstance(obj, (list, tuple)):
|
||||
@@ -44,6 +47,7 @@ def json_safe(obj: Any) -> Any:
|
||||
|
||||
|
||||
def write_json(path: str | Path, payload: Any) -> None:
    """Serialize ``payload`` to ``path`` as two-space-indented UTF-8 JSON.

    ``payload`` is passed through ``json_safe`` before dumping, and missing
    parent directories are created first.
    """
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)
    text = json.dumps(json_safe(payload), indent=2)
    target.write_text(text, encoding="utf-8")
|
||||
@@ -55,6 +59,12 @@ def safe_train_test_split(
|
||||
random_state: int,
|
||||
stratify_col: str | None = None,
|
||||
):
|
||||
"""Split a DataFrame with optional stratification and graceful fallback.
|
||||
|
||||
scikit-learn raises when a requested stratum is too small. The tokenization
|
||||
pipeline prefers stratified splits when possible, but falls back to an
|
||||
unstratified split rather than failing on tiny smoke-test subsets.
|
||||
"""
|
||||
stratify = None
|
||||
if stratify_col is not None and stratify_col in df.columns:
|
||||
counts = df[stratify_col].value_counts()
|
||||
@@ -110,6 +120,7 @@ def assign_group_splits(
|
||||
)
|
||||
|
||||
def key_frame(frame: pd.DataFrame) -> set[tuple]:
    """Collect the distinct ``group_cols`` values of ``frame`` as string tuples.

    Values are cast to ``str`` first so differing pandas dtypes cannot make
    otherwise-equal keys compare unequal.
    """
    as_text = frame[group_cols].astype(str)
    return {tuple(values) for values in as_text.values.tolist()}
|
||||
|
||||
train_keys = key_frame(train_groups)
|
||||
@@ -117,6 +128,7 @@ def assign_group_splits(
|
||||
test_keys = key_frame(test_groups)
|
||||
|
||||
def split_for_row(row) -> str:
|
||||
"""Map one original row back to its group-level split assignment."""
|
||||
key = tuple(str(row[col]) for col in group_cols)
|
||||
if key in train_keys:
|
||||
return "train"
|
||||
|
||||
@@ -98,6 +98,7 @@ def board_canvas_settings(board_key: str, df_token_meta: pd.DataFrame | None = N
|
||||
|
||||
|
||||
def _board_holds(df_token_meta: pd.DataFrame, board_key: str) -> pd.DataFrame:
|
||||
"""Return one metadata row per plotted hold for a board."""
|
||||
holds = df_token_meta[
|
||||
(df_token_meta["kind"] == "hold")
|
||||
& (df_token_meta["board_key"].astype(str) == str(board_key))
|
||||
@@ -118,6 +119,7 @@ def _route_with_coords(
|
||||
df_token_meta: pd.DataFrame,
|
||||
board_key: str,
|
||||
) -> pd.DataFrame:
|
||||
"""Attach x/y coordinates to route hold records using token metadata."""
|
||||
holds = _board_holds(df_token_meta, board_key)
|
||||
coords = holds[["board_key", "board_token_prefix", "placement_id", "x", "y"]].drop_duplicates(
|
||||
["board_key", "placement_id"]
|
||||
|
||||
Reference in New Issue
Block a user