Added & fixed some documentation

This commit is contained in:
2026-05-29 14:20:22 -04:00
parent 528ea9058f
commit 69389d1ebf
22 changed files with 289 additions and 2 deletions

View File

@@ -1,3 +1,4 @@
"""PyTorch dataset adapters for tokenized climbing-board routes."""
from __future__ import annotations
import torch
@@ -5,7 +6,15 @@ from torch.utils.data import Dataset
class RouteGradeDataset(Dataset):
"""Dataset for transformer encoder grade prediction.
Each item returns a padded token sequence, a boolean attention mask, the
continuous display-difficulty target, and a small amount of route identity
metadata used when writing prediction CSVs.
"""
def __init__(self, df, max_len: int, pad_id: int):
"""Store model IDs and labels from a tokenized route DataFrame."""
self.row_ids = df["row_id"].tolist() if "row_id" in df.columns else df.index.tolist()
self.ids = df["model_ids"].tolist()
self.targets = df["display_difficulty"].astype(float).values
@@ -15,9 +24,11 @@ class RouteGradeDataset(Dataset):
self.pad_id = int(pad_id)
def __len__(self) -> int:
"""Return the number of route examples."""
return len(self.ids)
def __getitem__(self, idx: int):
"""Return one padded encoder example and its regression target."""
ids = list(self.ids[idx])[: self.max_len]
mask = [1] * len(ids)
if len(ids) < self.max_len:
@@ -36,15 +47,25 @@ class RouteGradeDataset(Dataset):
class RouteGPTDataset(Dataset):
"""Dataset for causal next-token route generation.
The full sequence is padded once, then split into ``input_ids`` and
``target_ids`` shifted by one position for teacher-forced language-model
training.
"""
def __init__(self, df, max_len: int, pad_id: int):
"""Store GPT token ID sequences from a tokenized route DataFrame."""
self.ids = df["gpt_ids"].tolist()
self.max_len = int(max_len)
self.pad_id = int(pad_id)
def __len__(self) -> int:
"""Return the number of route examples."""
return len(self.ids)
def __getitem__(self, idx: int):
"""Return one padded causal-language-model training example."""
ids = list(self.ids[idx])[: self.max_len]
if len(ids) < self.max_len:
ids += [self.pad_id] * (self.max_len - len(ids))

View File

@@ -1,3 +1,9 @@
"""Evaluation utilities for generated climbing-board routes.
The helpers in this module are intentionally model-agnostic: they work from
tokens, frames strings, and token metadata so notebooks, scripts, and tests can
reuse the same route validity, novelty, and geometry calculations.
"""
from __future__ import annotations
import re
@@ -11,10 +17,12 @@ from .tokenization import parse_tokens, tokens_to_hold_records
def parse_token_list(value) -> list[str]:
    """Delegate to the shared ``parse_tokens`` helper.

    Kept as a thin alias so existing callers of this module keep working.
    """
    parsed = parse_tokens(value)
    return parsed
def validity_from_records(records: list[dict[str, object]], requested_board_prefix: str | None = None) -> dict[str, object]:
"""Compute evaluation-specific route-validity flags from hold records."""
placements = [int(record["placement_id"]) for record in records]
roles = [str(record["role"]) for record in records]
prefixes = [str(record["board_token_prefix"]) for record in records]
@@ -51,16 +59,19 @@ def validity_from_records(records: list[dict[str, object]], requested_board_pref
def frames_to_holds(frames: str | None) -> list[tuple[int, int]]:
    """Extract ``(placement_id, role_id)`` integer pairs from a frames string.

    Every ``p<digits>r<digits>`` occurrence yields one pair, in order of
    appearance. Any non-string input (including ``None``) gives an empty list.
    """
    if not isinstance(frames, str):
        return []
    pairs = []
    for match in re.finditer(r"p(\d+)r(\d+)", frames):
        placement_text, role_text = match.groups()
        pairs.append((int(placement_text), int(role_text)))
    return pairs
def holds_to_placement_set(holds: Iterable[tuple[int, int]]) -> frozenset[int]:
    """Collapse ``(placement_id, role_id)`` pairs into a set of placement IDs.

    The role component of each pair is ignored and duplicates are merged.
    """
    unique_placements = set()
    for placement_id, _role_id in holds:
        unique_placements.add(int(placement_id))
    return frozenset(unique_placements)
def jaccard(a: frozenset[int], b: frozenset[int]) -> float:
"""Return Jaccard similarity between two placement sets."""
if not a and not b:
return 1.0
if not a or not b:
@@ -73,6 +84,7 @@ def nearest_real_route_same_board(
generated_board_key: str,
real_df: pd.DataFrame,
) -> dict[str, object]:
"""Find the most similar real route on the same board by Jaccard score."""
board_frame = real_df[real_df["board_key"] == generated_board_key]
if board_frame.empty:
return {
@@ -100,6 +112,7 @@ def nearest_real_route_same_board(
def build_placement_coords(df_token_meta: pd.DataFrame) -> dict[tuple[str, int], dict[str, float]]:
"""Build a placement-coordinate lookup from token metadata."""
hold_meta = df_token_meta[df_token_meta["kind"] == "hold"].dropna(subset=["placement_id"]).copy()
coords = {}
for _, row in hold_meta.drop_duplicates(["board_key", "placement_id"]).iterrows():
@@ -116,6 +129,12 @@ def simple_route_features(
records: list[dict[str, object]],
placement_coords: dict[tuple[str, int], dict[str, float]],
) -> dict[str, float]:
"""Compute simple geometric route features from hold coordinates.
These features are descriptive rather than a full climbing-physics model:
height/width describe route spread, and hand-reach distances summarize the
pairwise spacing among start/middle/finish holds.
"""
rows = []
for record in records:
key = (str(board_key), int(record["placement_id"]))

View File

@@ -1,3 +1,4 @@
"""Sampling and structural-validity helpers for route generation."""
from __future__ import annotations
from typing import Iterable
@@ -9,6 +10,7 @@ from .tokenization import tokens_to_hold_records
def top_k_filter(logits: torch.Tensor, k: int | None) -> torch.Tensor:
"""Mask logits outside the top ``k`` choices for each batch row."""
if k is None or k <= 0 or k >= logits.size(-1):
return logits
values, _ = torch.topk(logits, k)
@@ -27,6 +29,11 @@ def sample_ids(
eos_id: int | None = None,
forbidden_ids: Iterable[int] | None = None,
) -> list[int]:
"""Autoregressively sample token IDs from a trained route generator.
The returned list includes the prompt IDs and all sampled IDs up to either
``max_new_tokens`` or the first sampled ``eos_id``.
"""
model.eval()
sequence = torch.tensor([prompt_ids], dtype=torch.long, device=device)
forbidden_ids = set(forbidden_ids or [])
@@ -36,6 +43,8 @@ def sample_ids(
logits, _ = model(idx_cond)
logits = logits[:, -1, :] / max(temperature, 1e-6)
# Special tokens like <PAD> and <CLS> are valid vocabulary entries but
# should never be emitted in the middle of a generated climb.
for token_id in forbidden_ids:
logits[:, int(token_id)] = -float("inf")
@@ -51,6 +60,7 @@ def sample_ids(
def prompt_tokens(board_prefix: str, angle: int, grouped_v: int) -> list[str]:
"""Build the conditioning prefix used before sampling hold tokens."""
return [
"<BOS>",
f"<BOARD_{board_prefix}>",
@@ -60,10 +70,12 @@ def prompt_tokens(board_prefix: str, angle: int, grouped_v: int) -> list[str]:
def hold_records(tokens: Iterable[str]) -> list[dict[str, object]]:
    """Delegate to ``tokens_to_hold_records`` to pull hold records from tokens."""
    records = tokens_to_hold_records(tokens)
    return records
def validity_summary(tokens: Iterable[str], requested_board_prefix: str | None = None) -> dict[str, object]:
"""Summarize basic structural validity for generated token sequences."""
records = hold_records(tokens)
placements = [record["placement_id"] for record in records]
roles = [record["role"] for record in records]
@@ -94,6 +106,11 @@ def validity_summary(tokens: Iterable[str], requested_board_prefix: str | None =
def generated_tokens_to_frames(tokens: Iterable[str], role_name_to_id: dict[str, int], board_prefix: str | None = None) -> str:
"""Convert generated hold tokens back into a frames string.
Duplicate placements and unknown roles are skipped, matching the forgiving
cleanup used by the demo scripts and webapp.
"""
pieces = []
seen = set()
for record in hold_records(tokens):
@@ -121,6 +138,7 @@ def generate_one(
top_k: int | None = 50,
max_new_tokens: int = 40,
) -> dict[str, object]:
"""Generate one route and return tokens, frames, request metadata, and validity."""
unk_id = stoi["<UNK>"]
eos_id = stoi["<EOS>"]
forbidden_ids = [

View File

@@ -1,5 +1,8 @@
"""Grade-scale helpers for BoardLib display difficulty and grouped V grades."""
from __future__ import annotations
# BoardLib display difficulties are integer-like values. This project groups
# them into V grades so TB2 and Kilter can share a compact grade-token space.
GRADE_TO_V = {
10: 0, 11: 0, 12: 0,
13: 1, 14: 1,
@@ -22,10 +25,12 @@ GRADE_TO_V = {
def to_grouped_v(display_difficulty: float) -> int:
    """Look up the grouped V grade for a display difficulty.

    The value is rounded to an integer, clamped to the smallest and largest
    keys of ``GRADE_TO_V``, and then used as the lookup key.
    """
    nearest = int(round(float(display_difficulty)))
    highest_key = max(GRADE_TO_V)
    lowest_key = min(GRADE_TO_V)
    # Clamp from above first, then from below, mirroring max(min(...), ...).
    if nearest > highest_key:
        nearest = highest_key
    if nearest < lowest_key:
        nearest = lowest_key
    return GRADE_TO_V[nearest]
def grade_token(display_difficulty: float) -> str:
    """Format the ``<GRADE_V…>`` token for a display difficulty value."""
    grouped = to_grouped_v(display_difficulty)
    return f"<GRADE_V{grouped}>"

View File

@@ -1,3 +1,4 @@
"""Metrics used to evaluate continuous grade predictions."""
from __future__ import annotations
import math
@@ -10,6 +11,7 @@ from .grades import to_grouped_v
def regression_metrics(y_true, y_pred) -> dict[str, float]:
"""Compute difficulty-scale and grouped-V-grade prediction metrics."""
y_true = np.asarray(y_true)
y_pred = np.asarray(y_pred)
true_v = np.asarray([to_grouped_v(x) for x in y_true])
@@ -28,6 +30,7 @@ def regression_metrics(y_true, y_pred) -> dict[str, float]:
def metrics_by_board(pred_df: pd.DataFrame) -> pd.DataFrame:
"""Compute regression metrics separately for each board in a prediction table."""
rows = []
for board_key, frame in pred_df.groupby("board_key"):
metrics = regression_metrics(frame["y_true"].values, frame["y_pred"].values)
@@ -36,6 +39,7 @@ def metrics_by_board(pred_df: pd.DataFrame) -> pd.DataFrame:
def print_metrics(name: str, metrics: dict[str, float]) -> None:
"""Pretty-print a metric dictionary in the training scripts."""
print(name)
print("-" * len(name))
for key, value in metrics.items():

View File

@@ -1,3 +1,4 @@
"""Neural network definitions for grade prediction and route generation."""
from __future__ import annotations
import torch
@@ -6,7 +7,13 @@ import torch.nn.functional as F
class JointRouteTransformerRegressor(nn.Module):
"""Transformer encoder for joint TB2/Kilter route difficulty prediction."""
"""Transformer encoder for joint TB2/Kilter route difficulty prediction.
Inputs are token IDs plus an attention mask. Token, position, and learned
projections of coordinate metadata are added before the encoder. The first
``<CLS>`` position is then used as a pooled route representation for scalar
difficulty regression.
"""
def __init__(
self,
@@ -20,6 +27,7 @@ class JointRouteTransformerRegressor(nn.Module):
dropout: float = 0.10,
pad_id: int = 0,
):
"""Create the encoder, coordinate projection, and regression head."""
super().__init__()
self.vocab_size = vocab_size
self.max_len = max_len
@@ -55,9 +63,12 @@ class JointRouteTransformerRegressor(nn.Module):
)
def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
"""Return one continuous difficulty prediction per input sequence."""
batch_size, seq_len = input_ids.shape
positions = torch.arange(seq_len, device=input_ids.device).unsqueeze(0).expand(batch_size, seq_len)
# Coordinate features are indexed by token ID, so every occurrence of a
# hold token gets the same physical x/y hint wherever it appears.
x = self.token_emb(input_ids) + self.pos_emb(positions)
x = x + self.coord_proj(self.coord_features[input_ids])
@@ -70,7 +81,11 @@ class JointRouteTransformerRegressor(nn.Module):
class JointRouteGPT(nn.Module):
"""Tiny GPT-style causal transformer for board-conditioned route generation."""
"""Tiny GPT-style causal transformer for board-conditioned route generation.
PyTorch's ``TransformerEncoder`` is used with a causal mask, which makes it
behave like a decoder-only language model for short route sequences.
"""
def __init__(
self,
@@ -82,6 +97,7 @@ class JointRouteGPT(nn.Module):
dropout: float = 0.10,
pad_id: int = 0,
):
"""Create the token/position embeddings, causal blocks, and LM head."""
super().__init__()
self.vocab_size = vocab_size
self.block_size = block_size
@@ -114,6 +130,7 @@ class JointRouteGPT(nn.Module):
idx: torch.Tensor,
targets: torch.Tensor | None = None,
) -> tuple[torch.Tensor, torch.Tensor | None]:
"""Return next-token logits and, when targets are supplied, CE loss."""
_, seq_len = idx.shape
if seq_len > self.block_size:
idx = idx[:, -self.block_size :]
@@ -126,6 +143,8 @@ class JointRouteGPT(nn.Module):
torch.ones(seq_len, seq_len, device=idx.device, dtype=torch.bool),
diagonal=1,
)
# Padding masks suppress attention to right-padded context tokens while
# the causal mask suppresses attention to future positions.
key_padding_mask = idx.eq(self.pad_id)
h = self.blocks(

View File

@@ -1,9 +1,16 @@
"""Path discovery helpers for scripts that can be launched from any directory."""
from __future__ import annotations
from pathlib import Path
def find_project_root(start: str | Path | None = None) -> Path:
"""Walk upward until the repository root markers are found.
The project root is identified by the presence of both ``pyproject.toml`` and
``configs``. If no candidate directory has both markers, the resolved starting
directory is returned so callers still have a deterministic base path.
"""
current = Path(start).resolve() if start is not None else Path.cwd().resolve()
for candidate in [current, *current.parents]:
if (candidate / "pyproject.toml").exists() and (candidate / "configs").exists():

View File

@@ -1,3 +1,10 @@
"""Route tokenization helpers shared by training, evaluation, and demos.
The project represents every climb as a short symbolic sequence. Board,
angle, grade, and hold-role information are all encoded as tokens, while hold
tokens are namespaced by board so placement IDs from different products cannot
collide.
"""
from __future__ import annotations
import re
@@ -19,6 +26,8 @@ SPECIAL_TOKENS = [
"<MASK>",
]
# The token grammar is intentionally centralized here so training, generation,
# evaluation, and the webapp parse the same strings in the same way.
ANGLE_TOKEN_PATTERN = re.compile(r"^<ANGLE_(-?\d+)>$")
GRADE_TOKEN_PATTERN = re.compile(r"^<GRADE_V(\d+)>$")
BOARD_TOKEN_PATTERN = re.compile(r"^<BOARD_([A-Z0-9_]+)>$")
@@ -34,6 +43,12 @@ ROLE_SORT_ORDER = {
def parse_frames(frames_str: str | None) -> list[tuple[int, int]]:
"""Parse a frames string into ``(placement_id, role_id)`` pairs.
Frames strings are compact concatenations such as ``p344r5p369r6``. Invalid
or missing input returns an empty list so callers can skip unusable climbs
without special-case exception handling.
"""
if not isinstance(frames_str, str):
return []
matches = re.findall(r"p(\d+)r(\d+)", frames_str)
@@ -78,6 +93,7 @@ def tokens_to_hold_records(tokens: Iterable[str]) -> list[dict[str, object]]:
def make_placement_lookup(df_placements: pd.DataFrame) -> dict[tuple[str, int], dict]:
"""Build a coordinate/metadata lookup keyed by ``(board_key, placement_id)``."""
rows = {}
for _, row in df_placements.iterrows():
key = (str(row["board_key"]), int(row["placement_id"]))
@@ -86,6 +102,7 @@ def make_placement_lookup(df_placements: pd.DataFrame) -> dict[tuple[str, int],
def role_name(role_id: int, config: BoardConfig) -> str:
    """Look up the role name for ``role_id`` in ``config.role_id_to_name``.

    The ID is coerced to ``int`` before the lookup; IDs missing from the
    mapping resolve to ``"unknown"``.
    """
    names_by_id = config.role_id_to_name
    numeric_id = int(role_id)
    return names_by_id.get(numeric_id, "unknown")
@@ -94,6 +111,7 @@ def placement_xy(
placement_id: int,
placement_lookup: dict[tuple[str, int], dict],
) -> tuple[float, float]:
"""Return raw board coordinates for a placement, or NaNs if unknown."""
row = placement_lookup.get((str(board_key), int(placement_id)))
if row is None:
return (float("nan"), float("nan"))
@@ -105,7 +123,15 @@ def canonicalize_holds(
config: BoardConfig,
placement_lookup: dict[tuple[str, int], dict],
) -> list[tuple[int, int]]:
"""Sort holds into the canonical route order used by all model inputs.
Frames preserve setter/storage order, which is not always stable
across routes or boards. Canonical ordering keeps starts first, hand/foot
holds in a bottom-to-top scan, and finishes last, giving the models a more
consistent sequence grammar.
"""
def key(pair: tuple[int, int]):
"""Sort by semantic role, then board position, then placement ID."""
placement_id, role_id = pair
x, y = placement_xy(config.board_key, placement_id, placement_lookup)
name = role_name(role_id, config)
@@ -120,10 +146,12 @@ def canonicalize_holds(
def board_token(config: BoardConfig) -> str:
    """Format the ``<BOARD_…>`` token from ``config.token_prefix``."""
    prefix = config.token_prefix
    return f"<BOARD_{prefix}>"
def angle_token(angle: float) -> str:
    """Format the ``<ANGLE_…>`` token for an angle rounded to an integer.

    Rounding uses Python's built-in ``round`` on the float value.
    """
    whole_degrees = int(round(float(angle)))
    return f"<ANGLE_{whole_degrees}>"
@@ -132,6 +160,7 @@ def hold_token(
role_id: int,
config: BoardConfig,
) -> str:
"""Return a board-namespaced hold token for a placement and role."""
semantic_role = role_name(role_id, config)
return f"<{config.token_prefix}_p{int(placement_id)}_{semantic_role}>"
@@ -143,6 +172,12 @@ def tokenize_route(
include_grade: bool = True,
canonical: bool = True,
) -> list[str]:
"""Tokenize one climb row into the sequence consumed by the models.
``include_grade=True`` is used for GPT-style generation, where the target
grade is a conditioning token. ``include_grade=False`` is used for grade
prediction so the model cannot read the answer from its input.
"""
holds = parse_frames(row["frames"])
if canonical:
holds = canonicalize_holds(holds, config, placement_lookup)
@@ -165,6 +200,12 @@ def build_route_records(
configs_by_key: dict[str, BoardConfig],
placement_lookup: dict[tuple[str, int], dict],
) -> pd.DataFrame:
"""Create one training/evaluation record per climb-angle row.
The returned frame keeps both human-readable route metadata and model-ready
token sequences, which lets downstream scripts save compact CSV summaries
while still retaining the richer JSONL training artifacts.
"""
records: list[dict] = []
for _, row in df_climbs.iterrows():
@@ -230,6 +271,7 @@ def build_route_records(
def build_vocab(df_routes: pd.DataFrame) -> tuple[list[str], dict[str, int], dict[int, str]]:
"""Build the shared token vocabulary from grade-conditioned sequences."""
all_tokens: list[str] = []
for tokens in df_routes["tokens_with_grade"]:
all_tokens.extend(tokens)
@@ -245,11 +287,13 @@ def build_vocab(df_routes: pd.DataFrame) -> tuple[list[str], dict[str, int], dic
def encode(tokens: Iterable[str], stoi: dict[str, int]) -> list[int]:
    """Map each token to its integer ID via ``stoi``.

    Tokens absent from ``stoi`` are assigned the ID stored under ``"<UNK>"``,
    which must be present in the mapping.
    """
    fallback_id = stoi["<UNK>"]
    encoded = []
    for token in tokens:
        encoded.append(stoi.get(token, fallback_id))
    return encoded
def decode(ids: Iterable[int], itos: dict[int, str]) -> list[str]:
    """Map each ID back to its token string via ``itos``.

    IDs are coerced to ``int`` first; IDs missing from ``itos`` decode to
    ``"<UNK>"``.
    """
    decoded = []
    for raw_id in ids:
        decoded.append(itos.get(int(raw_id), "<UNK>"))
    return decoded
@@ -260,6 +304,12 @@ def build_token_metadata(
placement_lookup: dict[tuple[str, int], dict],
configs_by_prefix: dict[str, BoardConfig],
) -> pd.DataFrame:
"""Build per-token metadata used for coordinate features and plotting.
Hold tokens receive raw coordinates, normalized coordinates in ``[-1, 1]``,
role labels, and board identity. Non-hold tokens keep neutral coordinate
features so the grade predictor can safely index every token ID.
"""
bounds = {}
for board_key, frame in df_placements.groupby("board_key"):
xs = frame["x"].astype(float)
@@ -272,6 +322,7 @@ def build_token_metadata(
}
def normalize(value: float, lo: float, hi: float) -> float:
    """Linearly rescale ``value`` from ``[lo, hi]`` onto ``[-1, 1]``.

    Returns ``0.0`` when ``value`` is missing or when the range is degenerate
    (``hi == lo``), which avoids a division by zero.
    """
    if pd.isna(value):
        return 0.0
    if hi == lo:
        return 0.0
    span = hi - lo
    fraction = (float(value) - lo) / span
    return 2 * fraction - 1
@@ -353,6 +404,7 @@ def vocab_payload(
itos: dict[int, str],
configs_by_key: dict[str, BoardConfig],
) -> dict:
"""Package vocabulary and board metadata for JSON serialization."""
return {
"stoi": stoi,
"itos": {str(k): v for k, v in itos.items()},

View File

@@ -1,3 +1,4 @@
"""Small shared utilities for reproducibility, JSON output, and data splits."""
from __future__ import annotations
import json
@@ -11,6 +12,7 @@ from sklearn.model_selection import train_test_split
def set_seed(seed: int) -> None:
"""Seed Python, NumPy, and PyTorch when PyTorch is installed."""
random.seed(seed)
np.random.seed(seed)
try:
@@ -23,6 +25,7 @@ def set_seed(seed: int) -> None:
def json_safe(obj: Any) -> Any:
"""Convert NumPy/pandas values into JSON-serializable Python objects."""
if isinstance(obj, dict):
return {str(k): json_safe(v) for k, v in obj.items()}
if isinstance(obj, (list, tuple)):
@@ -44,6 +47,7 @@ def json_safe(obj: Any) -> Any:
def write_json(path: str | Path, payload: Any) -> None:
    """Serialize ``payload`` to ``path`` as 2-space-indented UTF-8 JSON.

    Missing parent directories are created first, and the payload is passed
    through ``json_safe`` before being dumped.
    """
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(json_safe(payload), indent=2)
    target.write_text(serialized, encoding="utf-8")
@@ -55,6 +59,12 @@ def safe_train_test_split(
random_state: int,
stratify_col: str | None = None,
):
"""Split a DataFrame with optional stratification and graceful fallback.
scikit-learn raises when a requested stratum is too small. The tokenization
pipeline prefers stratified splits when possible, but falls back to an
unstratified split rather than failing on tiny smoke-test subsets.
"""
stratify = None
if stratify_col is not None and stratify_col in df.columns:
counts = df[stratify_col].value_counts()
@@ -110,6 +120,7 @@ def assign_group_splits(
)
def key_frame(frame: pd.DataFrame) -> set[tuple]:
    """Collect the distinct ``group_cols`` value tuples of ``frame`` as strings.

    Values are cast to ``str`` so keys compare equal regardless of column dtype.
    """
    as_text = frame[group_cols].astype(str)
    return {tuple(values) for values in as_text.values.tolist()}
train_keys = key_frame(train_groups)
@@ -117,6 +128,7 @@ def assign_group_splits(
test_keys = key_frame(test_groups)
def split_for_row(row) -> str:
"""Map one original row back to its group-level split assignment."""
key = tuple(str(row[col]) for col in group_cols)
if key in train_keys:
return "train"

View File

@@ -98,6 +98,7 @@ def board_canvas_settings(board_key: str, df_token_meta: pd.DataFrame | None = N
def _board_holds(df_token_meta: pd.DataFrame, board_key: str) -> pd.DataFrame:
"""Return one metadata row per plotted hold for a board."""
holds = df_token_meta[
(df_token_meta["kind"] == "hold")
& (df_token_meta["board_key"].astype(str) == str(board_key))
@@ -118,6 +119,7 @@ def _route_with_coords(
df_token_meta: pd.DataFrame,
board_key: str,
) -> pd.DataFrame:
"""Attach x/y coordinates to route hold records using token metadata."""
holds = _board_holds(df_token_meta, board_key)
coords = holds[["board_key", "board_token_prefix", "placement_id", "x", "y"]].drop_duplicates(
["board_key", "placement_id"]