Add web demo polish and smoke-test pipeline

This commit is contained in:
Pawel
2026-05-24 20:00:40 -04:00
parent 2391c80003
commit bbf276d642
22 changed files with 614 additions and 306 deletions

View File

@@ -0,0 +1,33 @@
from __future__ import annotations
from pathlib import Path
from typing import Any
import torch
def load_checkpoint(
    checkpoint_path: str | Path,
    map_location: str | torch.device,
    *,
    trusted: bool = False,
) -> dict[str, Any]:
    """Load a PyTorch checkpoint, preferring safer weights-only loading.

    The first attempt always calls ``torch.load(..., weights_only=True)``.
    Set ``trusted=True`` only for checkpoints produced by this project or an
    otherwise trusted source; it permits a retry with ``weights_only=False``
    when the weights-only attempt fails.

    Older PyTorch versions do not support ``weights_only``; those fall back to
    the legacy loader for compatibility.

    Args:
        checkpoint_path: Location of the checkpoint file.
        map_location: Forwarded unchanged to ``torch.load``.
        trusted: Whether a failed weights-only load may be retried with
            ``weights_only=False``.

    Returns:
        Whatever ``torch.load`` returns for the checkpoint.

    Raises:
        OSError: If the file cannot be opened or read (for example
            ``FileNotFoundError``). Propagated unchanged so that a missing
            file is not misreported as a weights-only failure.
        RuntimeError: If weights-only loading fails for any other reason and
            ``trusted`` is False.
    """
    checkpoint_path = Path(checkpoint_path)
    try:
        return torch.load(checkpoint_path, map_location=map_location, weights_only=True)
    except TypeError:
        # NOTE(review): this compatibility fallback runs regardless of
        # ``trusted``. On PyTorch versions that reject the ``weights_only``
        # keyword it presumably performs a full pickle load -- confirm that
        # this is acceptable for untrusted inputs.
        return torch.load(checkpoint_path, map_location=map_location)
    except OSError:
        # A missing or unreadable file has nothing to do with weights-only
        # safety; retrying with trusted=True cannot help, so do not suggest it.
        raise
    except Exception as exc:
        if not trusted:
            raise RuntimeError(
                "Could not load checkpoint with weights_only=True. "
                "Only retry with trusted=True for checkpoints from a trusted source."
            ) from exc
        return torch.load(checkpoint_path, map_location=map_location, weights_only=False)

View File

@@ -135,12 +135,14 @@ def build_placements_query(config: BoardConfig) -> tuple[str, list]:
def load_board_data(
config: BoardConfig,
project_root: str | Path | None = None,
max_climbs: int | None = None,
) -> tuple[pd.DataFrame, pd.DataFrame]:
"""Load climbs and placements data for a single board.
Args:
config: Board configuration
project_root: Path to project root (for resolving db_path)
max_climbs: Optional row limit for fast smoke-test loads.
Returns:
Tuple of (climbs DataFrame, placements DataFrame)
@@ -154,6 +156,11 @@ def load_board_data(
climbs_query, climbs_params = build_climbs_query(config)
placements_query, placements_params = build_placements_query(config)
if max_climbs is not None:
if max_climbs < 1:
raise ValueError("max_climbs must be at least 1.")
climbs_query = f"{climbs_query}\nORDER BY c.uuid, cs.angle\nLIMIT ?"
climbs_params = [*climbs_params, int(max_climbs)]
with sqlite3.connect(db_path) as conn:
df_climbs = pd.read_sql_query(climbs_query, conn, params=climbs_params)
@@ -174,6 +181,7 @@ def load_board_data(
def load_multi_board_data(
configs: list[BoardConfig],
project_root: str | Path | None = None,
max_climbs_per_board: int | None = None,
) -> tuple[pd.DataFrame, pd.DataFrame]:
"""Load and concatenate data from multiple boards.
@@ -184,6 +192,7 @@ def load_multi_board_data(
Args:
configs: List of board configurations
project_root: Path to project root
max_climbs_per_board: Optional row limit per board for smoke tests.
Returns:
Tuple of (combined climbs DataFrame, combined placements DataFrame)
@@ -192,11 +201,15 @@ def load_multi_board_data(
placement_frames = []
for config in configs:
climbs, placements = load_board_data(config, project_root=project_root)
climbs, placements = load_board_data(
config,
project_root=project_root,
max_climbs=max_climbs_per_board,
)
climb_frames.append(climbs)
placement_frames.append(placements)
return (
pd.concat(climb_frames, ignore_index=True),
pd.concat(placement_frames, ignore_index=True),
)
)

View File

@@ -6,6 +6,7 @@ from torch.utils.data import Dataset
class RouteGradeDataset(Dataset):
def __init__(self, df, max_len: int, pad_id: int):
self.row_ids = df["row_id"].tolist() if "row_id" in df.columns else df.index.tolist()
self.ids = df["model_ids"].tolist()
self.targets = df["display_difficulty"].astype(float).values
self.uuids = df["uuid"].tolist()
@@ -28,6 +29,7 @@ class RouteGradeDataset(Dataset):
"input_ids": torch.tensor(ids, dtype=torch.long),
"attention_mask": torch.tensor(mask, dtype=torch.bool),
"target": torch.tensor(self.targets[idx], dtype=torch.float32),
"row_id": int(self.row_ids[idx]),
"uuid": self.uuids[idx],
"board_key": self.boards[idx],
}

View File

@@ -1,6 +1,5 @@
from __future__ import annotations
import ast
import re
from typing import Iterable
@@ -8,38 +7,11 @@ import numpy as np
import pandas as pd
from scipy.spatial.distance import pdist
HOLD_TOKEN_PATTERN = re.compile(r"^<([A-Z0-9_]+)_p(\d+)_(start|middle|finish|foot|unknown)>$")
from .tokenization import parse_tokens, tokens_to_hold_records
def parse_token_list(value) -> list[str]:
if isinstance(value, list):
return value
if not isinstance(value, str):
return []
try:
parsed = ast.literal_eval(value)
if isinstance(parsed, list):
return parsed
except Exception:
pass
return value.split()
def tokens_to_hold_records(tokens: Iterable[str]) -> list[dict[str, object]]:
rows = []
for token in tokens:
match = HOLD_TOKEN_PATTERN.match(token)
if match is None:
continue
rows.append(
{
"token": token,
"board_token_prefix": match.group(1),
"placement_id": int(match.group(2)),
"role": match.group(3),
}
)
return rows
return parse_tokens(value)
def validity_from_records(records: list[dict[str, object]], requested_board_prefix: str | None = None) -> dict[str, object]:
@@ -102,30 +74,30 @@ def nearest_real_route_same_board(
real_df: pd.DataFrame,
) -> dict[str, object]:
board_frame = real_df[real_df["board_key"] == generated_board_key]
best = {
"nearest_real_jaccard": -1.0,
"nearest_real_uuid": None,
"nearest_real_name": None,
"nearest_real_grouped_v": None,
"nearest_real_angle": None,
if board_frame.empty:
return {
"nearest_real_jaccard": np.nan,
"nearest_real_uuid": None,
"nearest_real_name": None,
"nearest_real_grouped_v": None,
"nearest_real_angle": None,
"novelty_distance": np.nan,
}
similarities = board_frame["hold_set"].map(lambda hold_set: jaccard(generated_set, hold_set))
best_idx = similarities.idxmax()
row = board_frame.loc[best_idx]
nearest_real_jaccard = float(similarities.loc[best_idx])
return {
"nearest_real_jaccard": nearest_real_jaccard,
"nearest_real_uuid": row["uuid"],
"nearest_real_name": row["climb_name"],
"nearest_real_grouped_v": row["grouped_v"],
"nearest_real_angle": row["angle"],
"novelty_distance": 1.0 - nearest_real_jaccard,
}
for _, row in board_frame.iterrows():
similarity = jaccard(generated_set, row["hold_set"])
if similarity > best["nearest_real_jaccard"]:
best.update(
{
"nearest_real_jaccard": similarity,
"nearest_real_uuid": row["uuid"],
"nearest_real_name": row["climb_name"],
"nearest_real_grouped_v": row["grouped_v"],
"nearest_real_angle": row["angle"],
}
)
best["novelty_distance"] = 1.0 - float(best["nearest_real_jaccard"])
return best
def build_placement_coords(df_token_meta: pd.DataFrame) -> dict[tuple[str, int], dict[str, float]]:
hold_meta = df_token_meta[df_token_meta["kind"] == "hold"].dropna(subset=["placement_id"]).copy()

View File

@@ -1,12 +1,11 @@
from __future__ import annotations
import re
from typing import Iterable
import torch
import torch.nn.functional as F
HOLD_TOKEN_PATTERN = re.compile(r"^<([A-Z0-9_]+)_p(\d+)_(start|middle|finish|foot|unknown)>$")
from .tokenization import tokens_to_hold_records
def top_k_filter(logits: torch.Tensor, k: int | None) -> torch.Tensor:
@@ -61,20 +60,7 @@ def prompt_tokens(board_prefix: str, angle: int, grouped_v: int) -> list[str]:
def hold_records(tokens: Iterable[str]) -> list[dict[str, object]]:
rows = []
for token in tokens:
match = HOLD_TOKEN_PATTERN.match(token)
if match is None:
continue
rows.append(
{
"board_prefix": match.group(1),
"placement_id": int(match.group(2)),
"role": match.group(3),
"token": token,
}
)
return rows
return tokens_to_hold_records(tokens)
def validity_summary(tokens: Iterable[str], requested_board_prefix: str | None = None) -> dict[str, object]:

View File

@@ -10,6 +10,7 @@ from pathlib import Path
import torch
from .checkpoints import load_checkpoint
from .config import BoardConfig, load_board_config
from .generation import generate_one
from .grades import to_grouped_v
@@ -75,10 +76,7 @@ def load_grade_predictor(
resolved_device = torch.device(device or ("cuda" if torch.cuda.is_available() else "cpu"))
try:
checkpoint = torch.load(checkpoint_path, map_location=resolved_device, weights_only=False)
except TypeError:
checkpoint = torch.load(checkpoint_path, map_location=resolved_device)
checkpoint = load_checkpoint(checkpoint_path, map_location=resolved_device, trusted=True)
cfg = checkpoint["config"]
stoi = {str(k): int(v) for k, v in checkpoint["stoi"].items()}
@@ -176,10 +174,7 @@ def load_route_generator(
resolved_device = torch.device(device or ("cuda" if torch.cuda.is_available() else "cpu"))
try:
checkpoint = torch.load(checkpoint_path, map_location=resolved_device, weights_only=False)
except TypeError:
checkpoint = torch.load(checkpoint_path, map_location=resolved_device)
checkpoint = load_checkpoint(checkpoint_path, map_location=resolved_device, trusted=True)
cfg = checkpoint["config"]
stoi = {str(k): int(v) for k, v in checkpoint["stoi"].items()}
@@ -332,4 +327,3 @@ def predict_frames_grade(
"requested_angle": int(angle),
"frames": frames,
}

View File

@@ -1,6 +1,7 @@
from __future__ import annotations
import re
import ast
from typing import Iterable
import numpy as np
@@ -39,6 +40,43 @@ def parse_frames(frames_str: str | None) -> list[tuple[int, int]]:
return [(int(placement_id), int(role_id)) for placement_id, role_id in matches]
def parse_tokens(value) -> list[str]:
    """Normalise a token container into a list of token strings.

    Accepted inputs:
        * a ``list`` -- every element is converted with ``str``;
        * a ``str`` holding a Python list literal (e.g. the ``repr`` of a
          list) -- the literal is evaluated and its elements stringified;
        * any other ``str`` -- split on whitespace.

    Any other input type yields an empty list.
    """
    if isinstance(value, list):
        return list(map(str, value))

    if isinstance(value, str):
        try:
            literal = ast.literal_eval(value)
        except Exception:
            # Not a Python literal; treat it as a whitespace-separated sequence.
            literal = None
        if isinstance(literal, list):
            return list(map(str, literal))
        return value.split()

    return []
def tokens_to_hold_records(tokens: Iterable[str]) -> list[dict[str, object]]:
    """Build one record per token that matches ``HOLD_TOKEN_PATTERN``.

    Each token is converted with ``str`` before matching; tokens that do not
    match are skipped. A record carries the token text, the first capture
    group under both ``board_token_prefix`` and ``board_prefix``, the second
    capture group as an integer ``placement_id``, and the third capture group
    as ``role``.
    """
    records: list[dict[str, object]] = []
    for raw in tokens:
        text = str(raw)
        hit = HOLD_TOKEN_PATTERN.match(text)
        if hit is None:
            continue
        prefix = hit.group(1)
        # The prefix is exposed under two keys so that callers using either
        # naming can read the same record.
        record = {
            "token": text,
            "board_token_prefix": prefix,
            "board_prefix": prefix,
            "placement_id": int(hit.group(2)),
            "role": hit.group(3),
        }
        records.append(record)
    return records
def make_placement_lookup(df_placements: pd.DataFrame) -> dict[tuple[str, int], dict]:
rows = {}
for _, row in df_placements.iterrows():

View File

@@ -94,7 +94,6 @@ def assign_group_splits(
``train``, ``val``, or ``test``.
"""
group_df = df[group_cols + ([stratify_col] if stratify_col else [])].copy()
group_df["__row_index"] = range(len(group_df))
group_df = group_df.drop_duplicates(group_cols).reset_index(drop=True)
train_groups, temp_groups = safe_train_test_split(

View File

@@ -6,15 +6,13 @@ coordinate extent, then scatter route holds in board coordinates.
"""
from __future__ import annotations
import ast
import re
from pathlib import Path
from typing import Iterable
import matplotlib.pyplot as plt
import pandas as pd
HOLD_TOKEN_PATTERN = re.compile(r"^<([A-Z0-9_]+)_p(\d+)_(start|middle|finish|foot|unknown)>$")
from .tokenization import parse_tokens, tokens_to_hold_records
# These are the same coordinate windows used in the earlier visualization
# notebooks. They come from the product size geometry rather than from the
@@ -58,39 +56,9 @@ ROLE_SIZES = {
}
def parse_tokens(value) -> list[str]:
"""Parse a generated token list from a list, repr string, or sequence string."""
if isinstance(value, list):
return [str(v) for v in value]
if not isinstance(value, str):
return []
try:
parsed = ast.literal_eval(value)
if isinstance(parsed, list):
return [str(v) for v in parsed]
except Exception:
pass
return value.split()
def tokens_to_route_records(tokens: Iterable[str]) -> pd.DataFrame:
"""Extract generated hold records from model tokens."""
rows = []
for token in tokens:
match = HOLD_TOKEN_PATTERN.match(str(token))
if match is None:
continue
rows.append(
{
"token": token,
"board_token_prefix": match.group(1),
"placement_id": int(match.group(2)),
"role": match.group(3),
}
)
return pd.DataFrame(rows)
return pd.DataFrame(tokens_to_hold_records(tokens))
def load_token_metadata(tokenized_dir: str | Path) -> pd.DataFrame: