initial commit

This commit is contained in:
Pawel
2026-05-21 07:21:13 -04:00
commit d510d07ed9
50 changed files with 5359 additions and 0 deletions
+360
View File
@@ -0,0 +1,360 @@
#!/usr/bin/env python3
"""
ClimbingBoardGPT — Route Tokenization Script
This script converts raw climbing route data from SQLite databases into
tokenized sequences suitable for training transformer models.
What is tokenization?
---------------------
In NLP, tokenization converts raw text into discrete symbols (tokens) that
a model can process. For example, GPT-2 uses Byte-Pair Encoding (BPE) to
split "climbing" into ["cl", "imb", "ing"].
For climbing routes, we tokenize differently:
- Each hold on the board becomes a unique token (e.g., <TB2_p344_start>)
- Board identity, angle, and grade become conditioning tokens
- Special tokens mark sequence boundaries (<BOS>, <EOS>, etc.)
The key insight: climbing routes ARE sequences, just like sentences. The
same transformer architectures that learn English grammar can learn "climb
grammar" — which holds tend to follow which, how start holds differ from
finish holds, etc.
This script:
1. Loads board configurations from JSON files
2. Queries SQLite databases for climb and placement data
3. Parses frame strings (e.g., "p344r5p369r6p603r7") into structured data
4. Maps board-specific role IDs to shared semantic roles
5. Canonicalizes hold order (starts first, then middles by Y, etc.)
6. Generates two token sequences per route:
- with_grade: includes <GRADE_V6> for GPT training
- without_grade: excludes grade for BERT-style prediction
7. Builds vocabulary, train/val/test splits, and saves all artifacts
Usage:
python scripts/01_tokenize_routes.py --boards tb2,kilter
python scripts/01_tokenize_routes.py --boards tb2
python scripts/01_tokenize_routes.py --boards kilter
"""
from __future__ import annotations
import argparse
import json
import sys
from pathlib import Path
# Set up the project root so we can import our custom package
REPO_ROOT = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(REPO_ROOT / "src"))
import pandas as pd
from climbingboardgpt.config import load_board_configs, parse_board_keys
from climbingboardgpt.data import load_multi_board_data
from climbingboardgpt.tokenization import (
build_route_records,
build_token_metadata,
build_vocab,
encode,
make_placement_lookup,
vocab_payload,
)
from climbingboardgpt.utils import json_safe, safe_train_test_split, set_seed, write_json
def parse_args() -> argparse.Namespace:
    """Build the tokenization CLI and parse ``sys.argv``.

    Options:
        --boards: Comma-separated board config names. Default: "tb2,kilter"
        --out-dir: Directory that receives the tokenized artifacts
        --seed: Random seed used for reproducible train/val/test splits

    Returns:
        The parsed arguments as an ``argparse.Namespace``.
    """
    usage_examples = """
Examples:
# Tokenize both boards (default)
python scripts/01_tokenize_routes.py --boards tb2,kilter
# Tokenize only TB2
python scripts/01_tokenize_routes.py --boards tb2
# Custom output directory
python scripts/01_tokenize_routes.py --out-dir /path/to/output
"""
    cli = argparse.ArgumentParser(
        description="Tokenize TB2 and/or Kilter routes for ClimbingBoardGPT.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples,
    )

    # One entry per option: the flag and the keyword arguments handed to
    # add_argument, registered in the order they should appear in --help.
    option_table = (
        (
            "--boards",
            {
                "type": str,
                "default": "tb2,kilter",
                "help": "Comma-separated board config names (default: tb2,kilter)",
            },
        ),
        (
            "--out-dir",
            {
                "type": Path,
                "default": REPO_ROOT / "data" / "processed" / "tokenized",
                "help": "Output directory for tokenized artifacts",
            },
        ),
        (
            "--seed",
            {
                "type": int,
                "default": 42,
                "help": "Random seed for reproducible splits (default: 42)",
            },
        ),
    )
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)

    return cli.parse_args()
def main() -> None:
    """Run the route-tokenization pipeline end to end and save its artifacts.

    Stages, in order:
    1. Load the requested board configurations
    2. Load raw climb and placement data from the board databases
    3. Build tokenized route records from the frame strings
    4. Build the shared vocabulary
    5. Encode the token sequences as integer IDs
    6. Assign train/val/test splits, stratified by board x grade
    7. Build per-token metadata (coordinates, roles, etc.)
    8. Write every artifact to the output directory

    Raises:
        RuntimeError: If no route could be tokenized.
    """
    cli = parse_args()

    # Seed before anything random happens so the splits are reproducible.
    set_seed(cli.seed)
    out_dir = cli.out_dir
    out_dir.mkdir(parents=True, exist_ok=True)

    def report_per_board(frame, noun):
        # Print one count line per board present in `frame`.
        print(" Per board:")
        for key in frame["board_key"].unique():
            count = (frame["board_key"] == key).sum()
            print(f" {key}: {count:,} {noun}")

    # ── Stage 1: board configurations ────────────────────────────────────
    # Boards are described by config objects (layout_id, role_definitions,
    # max_angle, token_prefix), so supporting a new board means adding a
    # config rather than editing this script.
    board_keys = parse_board_keys(cli.boards)
    configs = load_board_configs(board_keys)
    configs_by_key = {}
    configs_by_prefix = {}
    for board_cfg in configs:
        configs_by_key[board_cfg.board_key] = board_cfg
        configs_by_prefix[board_cfg.token_prefix] = board_cfg

    print(f"Loaded {len(configs)} board configuration(s):")
    for board_cfg in configs:
        print(
            f" {board_cfg.display_name} (key={board_cfg.board_key}, prefix={board_cfg.token_prefix})",
            f" layout_id={board_cfg.layout_id}, max_angle={board_cfg.max_angle}",
            f" role_definitions={board_cfg.role_definitions}",
            sep="\n",
        )

    # ── Stage 2: raw data ────────────────────────────────────────────────
    # The frames string is the core input: "p344r5p369r6p603r7" lists
    # placement/role pairs (placement 344 with role 5, and so on).
    print("\nLoading data from databases...")
    df_climbs, df_placements = load_multi_board_data(configs, project_root=REPO_ROOT)
    placement_lookup = make_placement_lookup(df_placements)
    print(f" Total climb-angle entries: {len(df_climbs):,}")
    print(f" Total placements: {len(df_placements):,}")
    report_per_board(df_climbs, "entries")

    # ── Stage 3: tokenized route records ─────────────────────────────────
    # Each route yields two sequences: one that contains the grade token
    # (for the generator) and one without it (for the grade predictor,
    # which must not see the answer).
    print("\nBuilding tokenized route records...")
    df_routes = build_route_records(
        df_climbs=df_climbs,
        configs_by_key=configs_by_key,
        placement_lookup=placement_lookup,
    )
    if df_routes.empty:
        raise RuntimeError("No routes were tokenized. Check raw DBs and board configs.")
    print(f" Tokenized routes: {len(df_routes):,}")
    report_per_board(df_routes, "routes")

    # ── Stage 4: shared vocabulary ───────────────────────────────────────
    print("\nBuilding vocabulary...")
    vocab_tokens, stoi, itos = build_vocab(df_routes)
    print(f" Vocabulary size: {len(stoi):,}")

    # Break the vocabulary down by token family for a quick sanity check.
    special_tokens = ["<PAD>", "<UNK>", "<BOS>", "<EOS>", "<CLS>", "<MASK>"]
    families = (
        ("Special tokens", lambda tok: tok in special_tokens),
        ("Board tokens", lambda tok: tok.startswith("<BOARD_")),
        ("Angle tokens", lambda tok: tok.startswith("<ANGLE_")),
        ("Grade tokens", lambda tok: tok.startswith("<GRADE_")),
        ("Hold tokens", lambda tok: "_p" in tok),
    )
    family_counts = [
        (label, sum(1 for tok in vocab_tokens if belongs(tok)))
        for label, belongs in families
    ]
    for label, count in family_counts:
        print(f" {label}: {count}")

    # ── Stage 5: integer encoding ────────────────────────────────────────
    def to_ids(tokens):
        return encode(tokens, stoi)

    for token_col, id_col in (
        ("tokens_with_grade", "ids_with_grade"),
        ("tokens_no_grade", "ids_no_grade"),
    ):
        df_routes[id_col] = df_routes[token_col].apply(to_ids)

    # ── Stage 6: stratified 80/10/10 split ───────────────────────────────
    # Stratifying on board x grouped grade keeps both boards and all
    # difficulty levels represented in every split.
    board_part = df_routes["board_key"].astype(str)
    grade_part = df_routes["grouped_v"].astype(str)
    df_routes["split_stratum"] = board_part + "__V" + grade_part

    train_df, holdout_df = safe_train_test_split(
        df_routes,
        test_size=0.20,
        random_state=cli.seed,
        stratify_col="split_stratum",
    )
    val_df, test_df = safe_train_test_split(
        holdout_df,
        test_size=0.50,
        random_state=cli.seed,
        stratify_col="split_stratum",
    )

    # Later splits overwrite earlier ones for a repeated uuid, matching the
    # train -> val -> test assignment order.
    split_of_uuid = {}
    for split_name, part in (("train", train_df), ("val", val_df), ("test", test_df)):
        for route_uuid in part["uuid"]:
            split_of_uuid[route_uuid] = split_name
    df_routes["split"] = df_routes["uuid"].map(split_of_uuid)

    def split_counts():
        # Board x split table of route counts.
        return df_routes.groupby(["board_key", "split"]).size().unstack(fill_value=0)

    print("\nSplit counts:")
    print(split_counts())

    # ── Stage 7: token metadata ──────────────────────────────────────────
    print("\nBuilding token metadata...")
    df_token_meta = build_token_metadata(
        vocab_tokens=vocab_tokens,
        stoi=stoi,
        df_placements=df_placements,
        placement_lookup=placement_lookup,
        configs_by_prefix=configs_by_prefix,
    )
    print(f" Token metadata rows: {len(df_token_meta):,}")

    # ── Stage 8: save artifacts ──────────────────────────────────────────
    # JSONL for streaming, CSV for analysis, JSON for the vocabulary.
    print("\nSaving artifacts...")
    with (out_dir / "routes_tokenized.jsonl").open("w", encoding="utf-8") as sink:
        sink.writelines(
            json.dumps(json_safe(route)) + "\n"
            for route in df_routes.to_dict(orient="records")
        )

    sequence_columns = [
        "uuid", "board_key", "board_display_name", "board_token_prefix", "board_token",
        "climb_name", "setter_username", "layout_id", "layout_name", "board_name",
        "frames", "angle", "display_difficulty", "grouped_v", "boulder_grade",
        "ascensionist_count", "quality_average", "fa_at",
        "n_holds", "n_start", "n_middle", "n_foot", "n_finish",
        "sequence_with_grade", "sequence_no_grade", "split",
    ]
    df_routes[sequence_columns].to_csv(out_dir / "route_sequences.csv", index=False)
    df_placements.to_csv(out_dir / "placement_metadata.csv", index=False)
    df_token_meta.to_csv(out_dir / "token_metadata.csv", index=False)
    write_json(out_dir / "token_vocab.json", vocab_payload(stoi, itos, configs_by_key))

    # Per-board summary statistics.
    summary_spec = {
        "n_routes": ("uuid", "count"),
        "mean_angle": ("angle", "mean"),
        "mean_display_difficulty": ("display_difficulty", "mean"),
        "mean_holds": ("n_holds", "mean"),
    }
    board_summary = df_routes.groupby("board_key").agg(**summary_spec).reset_index()
    board_summary.to_csv(out_dir / "board_summary.csv", index=False)

    rule = "=" * 60
    print(f"\n{rule}")
    print("Tokenization complete!")
    print(rule)
    print(f"Boards: {board_keys}")
    print(f"Tokenized routes: {len(df_routes):,}")
    print(f"Vocabulary size: {len(stoi):,}")
    print("Split counts:")
    print(split_counts())
    print(f"\nSaved artifacts to: {out_dir}")
    for artifact in sorted(out_dir.iterdir()):
        megabytes = artifact.stat().st_size / 1e6
        print(f" {artifact.name} ({megabytes:.1f} MB)")


if __name__ == "__main__":
    main()
+393
View File
@@ -0,0 +1,393 @@
#!/usr/bin/env python3
"""
ClimbingBoardGPT — Grade Prediction Training Script
This script trains a BERT-style transformer encoder to predict climb difficulty
from tokenized route sequences.
Architecture Overview:
----------------------
The model is a Transformer Encoder (similar to BERT) with a regression head:
Input: <CLS> <BOARD_TB2> <ANGLE_40> <TB2_p344_start> ... <TB2_p603_finish>
Token Embedding + Position Embedding + Coordinate Features
Transformer Encoder (4 layers, 4 heads, d_model=128)
<CLS> token output (pooled representation of the entire sequence)
MLP Head → single scalar (predicted difficulty)
Key Concepts:
1. <CLS> pooling: The <CLS> token aggregates information from the entire
sequence via self-attention. This is the standard BERT approach for
sequence-level tasks.
2. Coordinate features: Each hold token has physical (x, y) position
information that gets projected and added to the embedding. This gives
the model direct spatial knowledge without needing to learn it from data.
3. No grade token in input: The grade predictor must PREDICT the grade,
not see it. We use the "no_grade" token sequence.
4. MSE loss: Since we're predicting a continuous value (difficulty score),
we use Mean Squared Error loss rather than cross-entropy.
5. Joint training: Both TB2 and Kilter routes are trained together,
with <BOARD_TB2> / <BOARD_KILTER> tokens telling the model which
board it's operating on.
Usage:
python scripts/02_train_grade_predictor.py
python scripts/02_train_grade_predictor.py --epochs 100 --lr 1e-4
"""
from __future__ import annotations
import argparse
import json
import sys
from pathlib import Path
REPO_ROOT = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(REPO_ROOT / "src"))
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from climbingboardgpt.datasets import RouteGradeDataset
from climbingboardgpt.grades import to_grouped_v
from climbingboardgpt.metrics import metrics_by_board, print_metrics, regression_metrics
from climbingboardgpt.models import JointRouteTransformerRegressor
from climbingboardgpt.utils import set_seed, write_json
def parse_args() -> argparse.Namespace:
    """Build the grade-predictor training CLI and parse ``sys.argv``.

    Main hyperparameters and their defaults:
        --epochs: Maximum training epochs (75)
        --patience: Early stopping patience (12)
        --batch-size: Training batch size (128)
        --lr: Learning rate (3e-4)
        --d-model: Transformer embedding dimension (128)
        --nhead: Number of attention heads (4)
        --num-layers: Number of transformer layers (4)

    Returns:
        The parsed arguments as an ``argparse.Namespace``.
    """
    metrics_note = """
The model predicts display_difficulty (a continuous value) from tokenized
route sequences. Evaluation metrics include MAE, RMSE, R², and V-grade
accuracy (within ±1 V-grade).
"""
    cli = argparse.ArgumentParser(
        description="Train a joint TB2/Kilter transformer grade predictor.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=metrics_note,
    )

    processed_dir = REPO_ROOT / "data" / "processed"
    # (flag, type, default, help) in --help order; a help of None leaves
    # the option undocumented, which is add_argument's own default.
    option_table = (
        ("--tokenized-dir", Path, processed_dir / "tokenized", None),
        ("--out-dir", Path, processed_dir / "grade_prediction", None),
        ("--model-dir", Path, REPO_ROOT / "models", None),
        ("--epochs", int, 75, "Maximum training epochs"),
        ("--patience", int, 12, "Early stopping patience"),
        ("--batch-size", int, 128, "Training batch size"),
        ("--lr", float, 3e-4, "Learning rate"),
        ("--weight-decay", float, 1e-2, "AdamW weight decay"),
        ("--d-model", int, 128, "Transformer embedding dimension"),
        ("--nhead", int, 4, "Number of attention heads"),
        ("--num-layers", int, 4, "Number of transformer layers"),
        ("--dim-feedforward", int, 256, "Feedforward dimension"),
        ("--dropout", float, 0.10, "Dropout probability"),
        ("--seed", int, 42, "Random seed"),
        ("--device", str, None, "Device (cpu or cuda)"),
    )
    for flag, caster, fallback, description in option_table:
        cli.add_argument(flag, type=caster, default=fallback, help=description)

    return cli.parse_args()
def build_coord_features(df_token_meta: pd.DataFrame, vocab_size: int) -> torch.Tensor:
    """Assemble the per-token coordinate feature matrix.

    Row ``token_id`` of the result holds three values read from the
    metadata row for that token:
        - column 0: ``x_norm``
        - column 1: ``y_norm``
        - column 2: ``is_hold``

    A value that is missing (column absent, or NaN) is stored as 0.0, and
    tokens that never appear in ``df_token_meta`` keep an all-zero row.

    Args:
        df_token_meta: Token metadata with a ``token_id`` column and,
            optionally, ``x_norm`` / ``y_norm`` / ``is_hold`` columns.
        vocab_size: Number of rows in the returned matrix.

    Returns:
        A float32 tensor of shape (vocab_size, 3).
    """
    feature_columns = ("x_norm", "y_norm", "is_hold")
    matrix = np.zeros((vocab_size, len(feature_columns)), dtype=np.float32)

    for _, meta_row in df_token_meta.iterrows():
        row_index = int(meta_row["token_id"])
        for col_index, column in enumerate(feature_columns):
            raw_value = meta_row.get(column, 0.0)
            # NaN marks "no coordinate" (e.g. non-hold tokens); store 0.0.
            if pd.isna(raw_value):
                matrix[row_index, col_index] = 0.0
            else:
                matrix[row_index, col_index] = float(raw_value)

    return torch.tensor(matrix, dtype=torch.float32)
def run_epoch(model, loader, device, optimizer=None):
    """Run one pass over ``loader`` in training or evaluation mode.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        loader: Iterable of batch dicts with the keys ``input_ids``,
            ``attention_mask``, ``target``, ``uuid`` and ``board_key``.
        device: torch device the batch tensors are moved to.
        optimizer: If provided, run training (with gradient updates).
            If None, run evaluation: the model is put in eval mode and
            autograd is disabled for the forward pass.

    Returns:
        Tuple of (average_loss, predictions, targets, uuids, board_keys).
        ``average_loss`` is the per-sample mean MSE over the samples that
        were actually iterated; predictions and targets are numpy arrays.
    """
    is_train = optimizer is not None
    model.train(is_train)
    criterion = nn.MSELoss()

    weighted_losses, preds, targets, uuids, boards = [], [], [], [], []
    samples_seen = 0

    for batch in loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        target = batch["target"].to(device)
        batch_size = input_ids.size(0)

        if is_train:
            optimizer.zero_grad(set_to_none=True)

        # Only track gradients when they will be used; evaluation passes
        # would otherwise build an autograd graph for every batch.
        with torch.set_grad_enabled(is_train):
            pred = model(input_ids, attention_mask)
            loss = criterion(pred, target)

        if is_train:
            loss.backward()
            # Gradient clipping prevents exploding gradients
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

        # The criterion returns a batch mean; weight it by the batch size
        # so a smaller final batch does not skew the epoch average.
        weighted_losses.append(loss.item() * batch_size)
        samples_seen += batch_size
        preds.extend(pred.detach().cpu().numpy().tolist())
        targets.extend(target.detach().cpu().numpy().tolist())
        uuids.extend(batch["uuid"])
        boards.extend(batch["board_key"])

    # Divide by the samples actually processed rather than the dataset
    # size, so the average stays correct for loaders that skip samples.
    avg_loss = sum(weighted_losses) / max(1, samples_seen)
    return avg_loss, np.asarray(preds), np.asarray(targets), uuids, boards
def main() -> None:
    """Train, evaluate, and save the joint grade predictor.

    Steps:
    1. Load tokenized routes, vocabulary, and token metadata
    2. Prepare input sequences (with <CLS> token, without grade)
    3. Create train/val/test DataLoaders
    4. Build the coordinate feature matrix and initialize the model
    5. Train with early stopping on validation MAE
    6. Evaluate the best weights on the test set
    7. Save history, predictions, metrics, and the model checkpoint

    Raises:
        FileNotFoundError: If any tokenized artifact is missing.
        ValueError: If the train, val, or test split contains no routes.
    """
    args = parse_args()
    set_seed(args.seed)
    args.out_dir.mkdir(parents=True, exist_ok=True)
    args.model_dir.mkdir(parents=True, exist_ok=True)

    # ─────────────────────────────────────────────────────────────────────
    # Step 1: Load tokenized data
    # ─────────────────────────────────────────────────────────────────────
    seq_path = args.tokenized_dir / "route_sequences.csv"
    vocab_path = args.tokenized_dir / "token_vocab.json"
    meta_path = args.tokenized_dir / "token_metadata.csv"
    if not seq_path.exists() or not vocab_path.exists() or not meta_path.exists():
        raise FileNotFoundError("Missing tokenized artifacts. Run scripts/01_tokenize_routes.py first.")

    df_routes = pd.read_csv(seq_path)
    vocab = json.loads(vocab_path.read_text(encoding="utf-8"))
    # JSON object keys are always strings, so restore the intended types.
    stoi = {str(k): int(v) for k, v in vocab["stoi"].items()}
    itos = {int(k): str(v) for k, v in vocab["itos"].items()}
    df_token_meta = pd.read_csv(meta_path)
    pad_id = stoi["<PAD>"]
    unk_id = stoi["<UNK>"]

    # ─────────────────────────────────────────────────────────────────────
    # Step 2: Prepare input sequences
    # ─────────────────────────────────────────────────────────────────────
    # For grade prediction, we use the "no_grade" version of the sequence
    # so the model must PREDICT the grade, not see it in the input.
    def encode(tokens):
        return [stoi.get(token, unk_id) for token in tokens]

    df_routes["tokens_no_grade"] = df_routes["sequence_no_grade"].fillna("").str.split()
    # Swap the leading token for <CLS> (sequence-level pooling); an empty
    # sequence becomes just [<CLS>].
    df_routes["model_tokens"] = df_routes["tokens_no_grade"].apply(
        lambda tokens: ["<CLS>"] + tokens[1:] if tokens else ["<CLS>"]
    )
    df_routes["model_ids"] = df_routes["model_tokens"].apply(encode)
    df_routes["seq_len"] = df_routes["model_ids"].apply(len)
    max_len = int(df_routes["seq_len"].max())

    # ─────────────────────────────────────────────────────────────────────
    # Step 3: Create DataLoaders
    # ─────────────────────────────────────────────────────────────────────
    split_frames = {
        name: df_routes[df_routes["split"] == name].reset_index(drop=True)
        for name in ("train", "val", "test")
    }
    # Fail early with a clear message: training, early stopping and the
    # final evaluation each need at least one route in their split.
    empty_splits = [name for name, frame in split_frames.items() if frame.empty]
    if empty_splits:
        raise ValueError(
            f"No routes found for split(s) {empty_splits} in {seq_path}. "
            "Re-run scripts/01_tokenize_routes.py to regenerate the splits."
        )
    train_df = split_frames["train"]
    val_df = split_frames["val"]
    test_df = split_frames["test"]

    train_ds = RouteGradeDataset(train_df, max_len=max_len, pad_id=pad_id)
    val_ds = RouteGradeDataset(val_df, max_len=max_len, pad_id=pad_id)
    test_ds = RouteGradeDataset(test_df, max_len=max_len, pad_id=pad_id)
    train_loader = DataLoader(train_ds, batch_size=args.batch_size, shuffle=True)
    val_loader = DataLoader(val_ds, batch_size=args.batch_size, shuffle=False)
    test_loader = DataLoader(test_ds, batch_size=args.batch_size, shuffle=False)

    # ─────────────────────────────────────────────────────────────────────
    # Step 4: Initialize model
    # ─────────────────────────────────────────────────────────────────────
    # Use --device when given; otherwise CUDA if available, else CPU.
    device = torch.device(args.device or ("cuda" if torch.cuda.is_available() else "cpu"))
    coord_features = build_coord_features(df_token_meta, vocab_size=len(stoi))
    model = JointRouteTransformerRegressor(
        vocab_size=len(stoi),
        max_len=max_len,
        coord_features=coord_features,
        d_model=args.d_model,
        nhead=args.nhead,
        num_layers=args.num_layers,
        dim_feedforward=args.dim_feedforward,
        dropout=args.dropout,
        pad_id=pad_id,
    ).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)

    print(f"Device: {device}")
    print(f"Train/val/test: {len(train_ds):,}, {len(val_ds):,}, {len(test_ds):,}")
    print(f"Vocabulary size: {len(stoi):,}")
    print(f"Max sequence length: {max_len}")
    print(f"Parameters: {sum(p.numel() for p in model.parameters()):,}")

    # ─────────────────────────────────────────────────────────────────────
    # Step 5: Training loop with early stopping
    # ─────────────────────────────────────────────────────────────────────
    history = []
    best_val_mae = float("inf")
    best_state = None
    best_epoch = 0
    epochs_without_improvement = 0

    print("\nStarting training...")
    for epoch in range(1, args.epochs + 1):
        train_loss, train_pred, train_true, _, _ = run_epoch(model, train_loader, device, optimizer)
        val_loss, val_pred, val_true, _, _ = run_epoch(model, val_loader, device, optimizer=None)
        train_metrics = regression_metrics(train_true, train_pred)
        val_metrics = regression_metrics(val_true, val_pred)
        history.append({
            "epoch": epoch,
            "train_loss": train_loss,
            "val_loss": val_loss,
            "train_mae": train_metrics["mae"],
            "val_mae": val_metrics["mae"],
            "train_r2": train_metrics["r2"],
            "val_r2": val_metrics["r2"],
            "val_within_1_vgrade": val_metrics["within_1_vgrade"],
        })

        # Track best model by validation MAE; keep a detached CPU copy so
        # later epochs cannot overwrite the snapshot.
        if val_metrics["mae"] < best_val_mae:
            best_val_mae = val_metrics["mae"]
            best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
            best_epoch = epoch
            epochs_without_improvement = 0
        else:
            epochs_without_improvement += 1

        # Log the first epoch, every fifth epoch, and every new best.
        if epoch == 1 or epoch % 5 == 0 or epoch == best_epoch:
            print(
                f"Epoch {epoch:03d} | "
                f"train MAE {train_metrics['mae']:.3f} | "
                f"val MAE {val_metrics['mae']:.3f} | "
                f"val R² {val_metrics['r2']:.3f} | "
                f"val ±1V {val_metrics['within_1_vgrade']:.1f}%"
            )

        if epochs_without_improvement >= args.patience:
            print(f"Early stopping at epoch {epoch}; best epoch was {best_epoch}.")
            break

    # Restore the best weights (best_state stays None when --epochs is 0).
    if best_state is not None:
        model.load_state_dict(best_state)

    # ─────────────────────────────────────────────────────────────────────
    # Step 6: Test set evaluation
    # ─────────────────────────────────────────────────────────────────────
    _, test_pred, test_true, test_uuid, test_board = run_epoch(model, test_loader, device, optimizer=None)
    overall_metrics = regression_metrics(test_true, test_pred)

    pred_df = pd.DataFrame({
        "uuid": test_uuid,
        "board_key": test_board,
        "y_true": test_true,
        "y_pred": test_pred,
        "abs_error": np.abs(test_true - test_pred),
        "true_v": [to_grouped_v(value) for value in test_true],
        "pred_v": [to_grouped_v(value) for value in test_pred],
    })
    # NOTE(review): this join assumes uuid is unique per row of
    # route_sequences.csv; if one climb can appear at several angles the
    # merge would duplicate prediction rows — confirm against the data.
    pred_df = pred_df.merge(
        df_routes[["uuid", "climb_name", "angle", "boulder_grade", "sequence_no_grade"]],
        on="uuid",
        how="left",
    )
    board_metrics_df = metrics_by_board(pred_df)

    print_metrics("Overall joint test performance", overall_metrics)
    print("\nBoard-specific test performance:")
    print(board_metrics_df.to_string(index=False))

    # ─────────────────────────────────────────────────────────────────────
    # Step 7: Save artifacts
    # ─────────────────────────────────────────────────────────────────────
    pd.DataFrame(history).to_csv(args.out_dir / "training_history.csv", index=False)
    pred_df.to_csv(args.out_dir / "test_predictions.csv", index=False)
    board_metrics_df.to_csv(args.out_dir / "board_metrics.csv", index=False)
    write_json(args.out_dir / "overall_metrics.json", overall_metrics)

    # Save model checkpoint with all necessary info for loading. Weights
    # are moved to CPU (like coord_features) so the file can be loaded on
    # a machine without the training device.
    cpu_state_dict = {k: v.detach().cpu() for k, v in model.state_dict().items()}
    checkpoint = {
        "model_state_dict": cpu_state_dict,
        "config": {
            "vocab_size": len(stoi),
            "max_len": max_len,
            "d_model": args.d_model,
            "nhead": args.nhead,
            "num_layers": args.num_layers,
            "dim_feedforward": args.dim_feedforward,
            "dropout": args.dropout,
            "pad_id": pad_id,
        },
        "stoi": stoi,
        "itos": {str(k): v for k, v in itos.items()},
        "coord_features": coord_features.cpu(),
        "overall_metrics": overall_metrics,
    }
    model_path = args.model_dir / "joint_transformer_grade_predictor.pth"
    torch.save(checkpoint, model_path)

    print("\nSaved:")
    print(f" {args.out_dir}")
    print(f" {model_path}")


if __name__ == "__main__":
    main()
+388
View File
@@ -0,0 +1,388 @@
#!/usr/bin/env python3
"""
ClimbingBoardGPT — Route Generation Training Script
This script trains a GPT-style causal transformer to generate new climbing
routes conditioned on board type, angle, and target grade.
Architecture Overview:
----------------------
The model is a causal (autoregressive) transformer decoder:
Input: <BOS> <BOARD_TB2> <ANGLE_40> <GRADE_V6> <TB2_p344_start> ...
Token Embedding + Position Embedding
Causal Transformer (4 layers, 4 heads, n_embd=128)
[Each position can only attend to previous positions]
Language Modeling Head → next token logits
Sample next token → append to sequence → repeat
Key Concepts:
1. Causal masking: Unlike BERT which sees all tokens, GPT can only
attend to previous tokens. This enables autoregressive generation.
2. Teacher forcing: During training, we feed the ground-truth previous
token. During generation, we feed the model's own prediction.
3. Weight tying: The output projection shares weights with the input
embedding. This reduces parameters and improves training stability.
4. Temperature & top-k sampling: Control generation diversity.
- Low temperature (0.3) → conservative, realistic routes
- High temperature (1.5) → creative, unusual routes
- Top-k (default 50) → only consider the 50 most likely next tokens
5. Conditioning: The prompt tokens (<BOARD_...>, <ANGLE_...>, <GRADE_...>)
tell the model what kind of route to generate, similar to how
ChatGPT uses system prompts.
Usage:
python scripts/03_train_route_generator.py
python scripts/03_train_route_generator.py --epochs 100 --temperature 0.7
python scripts/03_train_route_generator.py --generate-board tb2 --generate-grades 3,5,7
"""
from __future__ import annotations
import argparse
import json
import math
import sys
from pathlib import Path
REPO_ROOT = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(REPO_ROOT / "src"))
import pandas as pd
import torch
from torch.utils.data import DataLoader
from climbingboardgpt.config import load_board_configs, parse_board_keys
from climbingboardgpt.datasets import RouteGPTDataset
from climbingboardgpt.generation import generate_one
from climbingboardgpt.models import JointRouteGPT
from climbingboardgpt.utils import set_seed
def csv_ints(value: str | None) -> list[int] | None:
    """Turn a comma-separated string such as "3,5,7" into a list of ints.

    Whitespace around each item is ignored and empty items are skipped.

    Args:
        value: The raw option text, or None when the option was not given.

    Returns:
        None when ``value`` is None or blank; otherwise the parsed integers
        in their original order.

    Raises:
        ValueError: If a non-empty item is not a valid integer.
    """
    if value is None:
        return None
    if value.strip() == "":
        return None

    pieces = (piece.strip() for piece in value.split(","))
    return [int(piece) for piece in pieces if piece]
def parse_args() -> argparse.Namespace:
    """Build the command-line interface of the generator script and parse it.

    The options cover four areas: artifact locations, optimisation settings,
    model size, and the sampling / generation conditions used after training.

    Returns:
        The namespace produced by ``argparse`` for the current ``sys.argv``.
    """
    processed_root = REPO_ROOT / "data" / "processed"

    cli = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description="Train a joint TB2/Kilter GPT-style route generator.",
        epilog="""
After training, the script generates sample routes for each board at
common angles and grades. Use --generate-board to generate for a
specific board, or leave unset to generate for all boards.
""",
    )
    add = cli.add_argument

    # Artifact locations.
    add("--tokenized-dir", default=processed_root / "tokenized", type=Path)
    add("--out-dir", default=processed_root / "generation", type=Path)
    add("--model-dir", default=REPO_ROOT / "models", type=Path)
    add("--boards", default="tb2,kilter", type=str, help="Board configs for role reconstruction")

    # Optimisation settings.
    add("--epochs", default=60, type=int, help="Maximum training epochs")
    add("--patience", default=10, type=int, help="Early stopping patience")
    add("--batch-size", default=128, type=int, help="Training batch size")
    add("--lr", default=3e-4, type=float, help="Learning rate")
    add("--weight-decay", default=1e-2, type=float, help="AdamW weight decay")

    # Model size.
    add("--n-embd", default=128, type=int, help="Embedding dimension")
    add("--n-head", default=4, type=int, help="Number of attention heads")
    add("--n-layer", default=4, type=int, help="Number of transformer layers")
    add("--dropout", default=0.10, type=float, help="Dropout probability")

    # Sampling behaviour and generation conditions.
    add("--temperature", default=0.9, type=float, help="Sampling temperature")
    add("--top-k", default=50, type=int, help="Top-k sampling parameter")
    add("--max-new-tokens", default=40, type=int, help="Max tokens to generate")
    add("--n-per-condition", default=10, type=int, help="Routes to generate per condition")
    add("--generate-board", default=None, type=str, help="Board key: tb2 or kilter")
    add("--generate-angles", default=None, type=str, help="Comma-separated angles")
    add("--generate-grades", default=None, type=str, help="Comma-separated V-grades")

    # Reproducibility and hardware.
    add("--seed", default=42, type=int, help="Random seed")
    add("--device", default=None, type=str, help="Device (cpu or cuda)")

    return cli.parse_args()
def evaluate_loss(model, loader, device) -> float:
    """Compute the sample-weighted mean loss of ``model`` over ``loader``.

    The model is switched to eval mode and left there; gradients are disabled
    for the whole pass.

    Args:
        model: Callable as ``model(input_ids, target_ids)`` returning a pair
            whose second element is a scalar loss tensor.
        loader: Iterable of batches, each a mapping with ``"input_ids"`` and
            ``"target_ids"`` tensors.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        Sum of ``loss * batch_size`` divided by the number of samples seen,
        or 0.0 when the loader yields no batches.
    """
    model.eval()
    weighted_losses = []
    batch_sizes = []
    with torch.no_grad():
        for batch in loader:
            inputs = batch["input_ids"].to(device)
            targets = batch["target_ids"].to(device)
            _logits, batch_loss = model(inputs, targets)
            rows = inputs.size(0)
            # Weight each batch by its size so a short final batch does not
            # count as much as a full one.
            weighted_losses.append(batch_loss.item() * rows)
            batch_sizes.append(rows)
    return sum(weighted_losses) / max(1, sum(batch_sizes))
def train_one_epoch(model, loader, optimizer, device) -> float:
    """Run one optimisation pass over ``loader`` and report the mean loss.

    Each batch supplies ground-truth ``input_ids`` and the matching
    ``target_ids`` (teacher forcing); the model returns the loss directly.

    Args:
        model: Callable as ``model(input_ids, target_ids)`` returning a pair
            whose second element is a differentiable scalar loss.
        loader: Iterable of batches, each a mapping with ``"input_ids"`` and
            ``"target_ids"`` tensors.
        optimizer: Optimiser that updates ``model.parameters()``.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        Sum of ``loss * batch_size`` divided by the number of samples seen,
        or 0.0 when the loader yields no batches.
    """
    model.train()
    weighted_losses = []
    batch_sizes = []
    for batch in loader:
        inputs = batch["input_ids"].to(device)
        targets = batch["target_ids"].to(device)

        optimizer.zero_grad(set_to_none=True)
        _logits, batch_loss = model(inputs, targets)
        batch_loss.backward()
        # Cap the global gradient norm at 1.0 before stepping so a single
        # bad batch cannot blow up the weights.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        rows = inputs.size(0)
        weighted_losses.append(batch_loss.item() * rows)
        batch_sizes.append(rows)
    return sum(weighted_losses) / max(1, sum(batch_sizes))
def main() -> None:
    """Train the route generator, evaluate it, sample routes, and save artifacts.

    Steps:
    1. Load tokenized data and vocabulary
    2. Prepare input/target pairs for causal language modeling
    3. Create train/val/test DataLoaders
    4. Initialize GPT model and optimizer
    5. Train with early stopping on validation loss
    6. Evaluate the best weights on the test split
    7. Generate sample routes per board, angle, and grade
    8. Save training history, generated routes, and model checkpoint

    Raises:
        FileNotFoundError: If the tokenized CSV or vocabulary JSON is missing.
        RuntimeError: If the longest token sequence has fewer than 2 tokens.
    """
    args = parse_args()
    set_seed(args.seed)
    args.out_dir.mkdir(parents=True, exist_ok=True)
    args.model_dir.mkdir(parents=True, exist_ok=True)
    # ─────────────────────────────────────────────────────────────────────
    # Step 1: Load data
    # ─────────────────────────────────────────────────────────────────────
    seq_path = args.tokenized_dir / "route_sequences.csv"
    vocab_path = args.tokenized_dir / "token_vocab.json"
    if not seq_path.exists() or not vocab_path.exists():
        raise FileNotFoundError("Missing tokenized artifacts. Run scripts/01_tokenize_routes.py first.")
    df_routes = pd.read_csv(seq_path)
    vocab = json.loads(vocab_path.read_text(encoding="utf-8"))
    # JSON object keys are always strings, so the id keys of "itos" are
    # converted back to int here (and back to str again when saving).
    stoi = {str(k): int(v) for k, v in vocab["stoi"].items()}
    itos = {int(k): str(v) for k, v in vocab["itos"].items()}
    pad_id = stoi["<PAD>"]
    unk_id = stoi["<UNK>"]
    # ─────────────────────────────────────────────────────────────────────
    # Step 2: Prepare sequences for causal language modeling
    # ─────────────────────────────────────────────────────────────────────
    # For GPT training, we use the "with grade" version because the model
    # needs to learn the relationship between grade and hold selection.
    #
    # Input: <BOS> <BOARD_TB2> <ANGLE_40> <GRADE_V6> <TB2_p344_start> ...
    # Target: <BOARD_TB2> <ANGLE_40> <GRADE_V6> <TB2_p344_start> <TB2_p369_middle> ...
    #
    # The input is shifted right by one position compared to the target.
    # This is the standard causal language modeling setup.
    def encode(tokens):
        # Tokens missing from the vocabulary map to <UNK>.
        return [stoi.get(token, unk_id) for token in tokens]
    # Missing sequences become "" and therefore an empty token list.
    df_routes["gpt_tokens"] = df_routes["sequence_with_grade"].fillna("").str.split()
    df_routes["gpt_ids"] = df_routes["gpt_tokens"].apply(encode)
    df_routes["seq_len"] = df_routes["gpt_ids"].apply(len)
    # max_len is taken over ALL splits, not just the training rows.
    max_len = int(df_routes["seq_len"].max())
    if max_len < 2:
        raise RuntimeError("Token sequences are too short to train the causal model.")
    block_size = max_len - 1  # Input length (one less than full sequence)
    # ─────────────────────────────────────────────────────────────────────
    # Step 3: Create DataLoaders
    # ─────────────────────────────────────────────────────────────────────
    train_df = df_routes[df_routes["split"] == "train"].reset_index(drop=True)
    val_df = df_routes[df_routes["split"] == "val"].reset_index(drop=True)
    test_df = df_routes[df_routes["split"] == "test"].reset_index(drop=True)
    # NOTE(review): presumably RouteGPTDataset reads the "gpt_ids" column and
    # yields "input_ids"/"target_ids" padded with pad_id — confirm in datasets.py.
    train_ds = RouteGPTDataset(train_df, max_len=max_len, pad_id=pad_id)
    val_ds = RouteGPTDataset(val_df, max_len=max_len, pad_id=pad_id)
    test_ds = RouteGPTDataset(test_df, max_len=max_len, pad_id=pad_id)
    # Only the training loader is shuffled; evaluation order is fixed.
    train_loader = DataLoader(train_ds, batch_size=args.batch_size, shuffle=True)
    val_loader = DataLoader(val_ds, batch_size=args.batch_size, shuffle=False)
    test_loader = DataLoader(test_ds, batch_size=args.batch_size, shuffle=False)
    # ─────────────────────────────────────────────────────────────────────
    # Step 4: Initialize model
    # ─────────────────────────────────────────────────────────────────────
    # An explicit --device wins; otherwise prefer CUDA when it is available.
    device = torch.device(args.device or ("cuda" if torch.cuda.is_available() else "cpu"))
    model = JointRouteGPT(
        vocab_size=len(stoi),
        block_size=block_size,
        n_embd=args.n_embd,
        n_head=args.n_head,
        n_layer=args.n_layer,
        dropout=args.dropout,
        pad_id=pad_id,
    ).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    print(f"Device: {device}")
    print(f"Train/val/test: {len(train_ds):,}, {len(val_ds):,}, {len(test_ds):,}")
    print(f"Vocabulary size: {len(stoi):,}")
    print(f"Block size: {block_size}")
    print(f"Parameters: {sum(p.numel() for p in model.parameters()):,}")
    # ─────────────────────────────────────────────────────────────────────
    # Step 5: Training loop with early stopping
    # ─────────────────────────────────────────────────────────────────────
    # We track perplexity (exp(loss)) as well as raw loss.
    # Perplexity answers: "On average, how many tokens was the model
    # choosing between at each step?"
    # Lower perplexity = better model.
    history = []
    best_val_loss = float("inf")
    best_state = None
    best_epoch = 0
    epochs_without_improvement = 0
    print("\nStarting GPT training...")
    for epoch in range(1, args.epochs + 1):
        train_loss = train_one_epoch(model, train_loader, optimizer, device)
        val_loss = evaluate_loss(model, val_loader, device)
        # The loss is capped at 20 before exponentiation so the reported
        # perplexity stays bounded (at most exp(20)) for very large losses.
        history.append({
            "epoch": epoch,
            "train_loss": train_loss,
            "val_loss": val_loss,
            "train_perplexity": math.exp(min(train_loss, 20)),
            "val_perplexity": math.exp(min(val_loss, 20)),
        })
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            # Keep a detached CPU copy so later optimizer steps cannot
            # modify the snapshot of the best weights.
            best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
            best_epoch = epoch
            epochs_without_improvement = 0
        else:
            epochs_without_improvement += 1
        # Log the first epoch, every fifth epoch, and every new best epoch.
        if epoch == 1 or epoch % 5 == 0 or epoch == best_epoch:
            print(
                f"Epoch {epoch:03d} | "
                f"train loss {train_loss:.3f} | "
                f"val loss {val_loss:.3f} | "
                f"val ppl {math.exp(min(val_loss, 20)):.1f}"
            )
        if epochs_without_improvement >= args.patience:
            print(f"Early stopping at epoch {epoch}; best epoch was {best_epoch}.")
            break
    # Restore the best weights; best_state stays None only if no epoch ever
    # improved on the initial infinite loss (e.g. --epochs 0).
    if best_state is not None:
        model.load_state_dict(best_state)
    # ─────────────────────────────────────────────────────────────────────
    # Step 6: Test evaluation
    # ─────────────────────────────────────────────────────────────────────
    test_loss = evaluate_loss(model, test_loader, device)
    print(f"\nBest validation loss: {best_val_loss:.4f}")
    print(f"Test loss: {test_loss:.4f}")
    print(f"Test perplexity: {math.exp(min(test_loss, 20)):.1f}")
    # ─────────────────────────────────────────────────────────────────────
    # Step 7: Generate sample routes
    # ─────────────────────────────────────────────────────────────────────
    # For each board, generate routes at common angles and grades.
    # This demonstrates the model's ability to produce novel routes
    # conditioned on board, angle, and difficulty.
    configs = load_board_configs(parse_board_keys(args.boards))
    configs_by_key = {config.board_key: config for config in configs}
    # Without --generate-board, every board key present in the data is used.
    board_keys_to_generate = [args.generate_board] if args.generate_board else sorted(df_routes["board_key"].unique())
    requested_angles = csv_ints(args.generate_angles)
    requested_grades = csv_ints(args.generate_grades)
    generated = []
    for board_key in board_keys_to_generate:
        board_frame = df_routes[df_routes["board_key"] == board_key]
        if board_frame.empty:
            continue
        # NOTE: raises KeyError when the board has routes in the data but its
        # config was not listed in --boards.
        config = configs_by_key[board_key]
        # Use common angles if none specified: the 5 most frequent angles for
        # this board, sorted ascending. An empty request list also falls back
        # here because of the `or`.
        angles = requested_angles or (
            board_frame["angle"].astype(int).value_counts().head(5).index.sort_values().tolist()
        )
        # Use common grades if none specified: the 8 most frequent grouped
        # V-grades for this board, sorted ascending.
        grades = requested_grades or (
            board_frame["grouped_v"].astype(int).value_counts().head(8).index.sort_values().tolist()
        )
        for angle in angles:
            for grade in grades:
                for _ in range(args.n_per_condition):
                    # generate_one returns a mapping that is merged into the
                    # row next to the board key.
                    generated.append({
                        "board_key": board_key,
                        **generate_one(
                            model=model,
                            stoi=stoi,
                            itos=itos,
                            device=device,
                            board_prefix=config.token_prefix,
                            angle=int(angle),
                            grouped_v=int(grade),
                            role_name_to_id=config.role_definitions,
                            temperature=args.temperature,
                            top_k=args.top_k,
                            max_new_tokens=args.max_new_tokens,
                        ),
                    })
    generated_df = pd.DataFrame(generated)
    if not generated_df.empty:
        print(f"\nGenerated routes: {len(generated_df):,}")
        print("Basic validity by board:")
        # Assumes generate_one includes a "basic_valid" entry — TODO confirm.
        print(generated_df.groupby("board_key")["basic_valid"].mean())
    # ─────────────────────────────────────────────────────────────────────
    # Step 8: Save artifacts
    # ─────────────────────────────────────────────────────────────────────
    pd.DataFrame(history).to_csv(args.out_dir / "training_history.csv", index=False)
    # Written even when no routes were generated (the frame is then empty).
    generated_df.to_csv(args.out_dir / "generated_routes.csv", index=False)
    # The checkpoint bundles weights, the constructor arguments needed to
    # rebuild the model, and both vocabulary mappings.
    checkpoint = {
        "model_state_dict": model.state_dict(),
        "config": {
            "vocab_size": len(stoi),
            "block_size": block_size,
            "n_embd": args.n_embd,
            "n_head": args.n_head,
            "n_layer": args.n_layer,
            "dropout": args.dropout,
            "pad_id": pad_id,
        },
        "stoi": stoi,
        "itos": {str(k): v for k, v in itos.items()},
        "best_val_loss": best_val_loss,
        "test_loss": test_loss,
    }
    model_path = args.model_dir / "joint_route_gpt_generator.pth"
    torch.save(checkpoint, model_path)
    print("\nSaved:")
    print(f" {args.out_dir}")
    print(f" {model_path}")
# Run the training pipeline only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()
+328
View File
@@ -0,0 +1,328 @@
#!/usr/bin/env python3
"""
ClimbingBoardGPT — Generated Route Evaluation Script
This script evaluates routes generated by the GPT model on four dimensions:
1. Validity: Does the route follow structural rules?
- At least 3 holds
- No duplicate placements
- At least one start and one finish hold
- All holds from the same board
2. Novelty: Is the route different from existing climbs?
- Measured by Jaccard distance from the nearest real route
3. Geometric plausibility: Are holds in reasonable positions?
- Height, width, mean hand reach distance
4. Grade consistency: Does the route's predicted grade match the request?
- Uses the trained grade predictor as a "critic"
This is analogous to how language models are evaluated using BLEU, ROUGE,
or human evaluation — but adapted for the climbing domain.
Usage:
python scripts/04_evaluate_generated_routes.py
python scripts/04_evaluate_generated_routes.py --grade-model-path models/joint_transformer_grade_predictor.pth
"""
from __future__ import annotations
import argparse
import sys
from pathlib import Path
REPO_ROOT = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(REPO_ROOT / "src"))
import numpy as np
import pandas as pd
import torch
from climbingboardgpt.evaluation import (
build_placement_coords,
frames_to_holds,
holds_to_placement_set,
nearest_real_route_same_board,
parse_token_list,
simple_route_features,
tokens_to_hold_records,
validity_from_records,
)
from climbingboardgpt.grades import to_grouped_v
from climbingboardgpt.models import JointRouteTransformerRegressor
def parse_args() -> argparse.Namespace:
    """Build the command-line interface of the evaluation script and parse it.

    Returns:
        The namespace produced by ``argparse`` for the current ``sys.argv``,
        holding the input/output directories, the grade-model checkpoint
        path, and the optional device override.
    """
    processed_root = REPO_ROOT / "data" / "processed"

    cli = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description="Evaluate generated TB2/Kilter route candidates.",
    )
    add = cli.add_argument

    # Inputs produced by the tokenization and generation scripts.
    add("--tokenized-dir", default=processed_root / "tokenized", type=Path)
    add("--generated-dir", default=processed_root / "generation", type=Path)
    # Where the evaluation tables are written.
    add("--out-dir", default=processed_root / "evaluation", type=Path)
    # Optional critic checkpoint and device override.
    add(
        "--grade-model-path",
        default=REPO_ROOT / "models" / "joint_transformer_grade_predictor.pth",
        type=Path,
    )
    add("--device", default=None, type=str)

    return cli.parse_args()
def load_grade_critic(model_path: Path, device: torch.device):
    """Restore the grade-predictor checkpoint for use as a route "critic".

    The critic scores generated routes so that the requested grade can be
    compared with the grade the predictor assigns.

    Args:
        model_path: Location of the saved checkpoint file.
        device: Device the checkpoint tensors and the model are placed on.

    Returns:
        None when ``model_path`` does not exist. Otherwise a dict with the
        keys ``"model"`` (the restored regressor in eval mode), ``"stoi"``,
        ``"pad_id"``, ``"unk_id"`` and ``"max_len"``.
    """
    if not model_path.exists():
        return None

    # NOTE(security): weights_only=False allows torch.load to unpickle
    # arbitrary Python objects, so only trusted checkpoints should be loaded.
    try:
        checkpoint = torch.load(model_path, map_location=device, weights_only=False)
    except TypeError:
        # Presumably for PyTorch versions whose torch.load does not accept
        # the weights_only keyword — retry without it.
        checkpoint = torch.load(model_path, map_location=device)

    cfg = checkpoint["config"]
    stoi = {str(token): int(index) for token, index in checkpoint["stoi"].items()}

    # Coordinate features may be stored as a plain nested list.
    coord_features = checkpoint["coord_features"]
    if not isinstance(coord_features, torch.Tensor):
        coord_features = torch.tensor(coord_features, dtype=torch.float32)

    # Architecture settings fall back to defaults when the checkpoint
    # config does not record them.
    model_kwargs = {
        "vocab_size": cfg["vocab_size"],
        "max_len": cfg["max_len"],
        "coord_features": coord_features,
        "d_model": cfg.get("d_model", 128),
        "nhead": cfg.get("nhead", 4),
        "num_layers": cfg.get("num_layers", 4),
        "dim_feedforward": cfg.get("dim_feedforward", 256),
        "dropout": cfg.get("dropout", 0.10),
        "pad_id": cfg.get("pad_id", stoi["<PAD>"]),
    }
    critic_model = JointRouteTransformerRegressor(**model_kwargs).to(device)
    critic_model.load_state_dict(checkpoint["model_state_dict"])
    critic_model.eval()

    critic = {"model": critic_model, "stoi": stoi}
    critic["pad_id"] = stoi["<PAD>"]
    critic["unk_id"] = stoi["<UNK>"]
    critic["max_len"] = cfg["max_len"]
    return critic
def predict_generated_grade(tokens: list[str], critic, device: torch.device) -> float:
    """Score a generated route with the critic model.

    Args:
        tokens: Token strings of the generated route.
        critic: Mapping with the keys ``"model"``, ``"stoi"``, ``"pad_id"``,
            ``"unk_id"`` and ``"max_len"``.
        device: Device on which the input tensors are created.

    Returns:
        The single value the critic model outputs for the route, as a float.
    """
    vocab = critic["stoi"]
    unknown_id = critic["unk_id"]
    limit = critic["max_len"]

    # The critic must predict the grade, so it never gets to see it.
    route = [tok for tok in tokens if not tok.startswith("<GRADE_")]
    # The encoder expects <CLS> first: swap a leading <BOS> for it,
    # otherwise simply prepend it.
    if route[:1] == ["<BOS>"]:
        route = route[1:]
    route = ["<CLS>", *route]

    # Encode, truncate to the critic's maximum length, then right-pad.
    encoded = [vocab.get(tok, unknown_id) for tok in route][:limit]
    n_real = len(encoded)
    n_pad = max(0, limit - n_real)
    ids = encoded + [critic["pad_id"]] * n_pad
    mask = [1] * n_real + [0] * n_pad  # 1 = real token, 0 = padding

    with torch.no_grad():
        id_tensor = torch.tensor([ids], dtype=torch.long, device=device)
        mask_tensor = torch.tensor([mask], dtype=torch.bool, device=device)
        prediction = critic["model"](id_tensor, mask_tensor)
        return float(prediction.cpu().item())
def main() -> None:
    """Evaluate generated routes and write the evaluation tables.

    Steps:
    1. Load generated routes and real routes
    2. Parse tokens and check validity
    3. Compute novelty (Jaccard distance from nearest real route)
    4. Compute geometric features
    5. Optionally use critic model for grade consistency
    6. Rank routes by composite score
    7. Save evaluation results

    Raises:
        FileNotFoundError: If the generated-routes CSV or the tokenized
            artifacts (route sequences, token metadata) are missing.
    """
    args = parse_args()
    args.out_dir.mkdir(parents=True, exist_ok=True)
    # ─────────────────────────────────────────────────────────────────────
    # Step 1: Load data
    # ─────────────────────────────────────────────────────────────────────
    generated_path = args.generated_dir / "generated_routes.csv"
    routes_path = args.tokenized_dir / "route_sequences.csv"
    token_meta_path = args.tokenized_dir / "token_metadata.csv"
    if not generated_path.exists():
        raise FileNotFoundError("Missing generated routes. Run scripts/03_train_route_generator.py first.")
    if not routes_path.exists() or not token_meta_path.exists():
        raise FileNotFoundError("Missing tokenized artifacts. Run scripts/01_tokenize_routes.py first.")
    df_generated = pd.read_csv(generated_path)
    df_real = pd.read_csv(routes_path)
    df_token_meta = pd.read_csv(token_meta_path)
    # ─────────────────────────────────────────────────────────────────────
    # Step 2: Parse tokens and check validity
    # ─────────────────────────────────────────────────────────────────────
    # Validity checks ensure generated routes are structurally sound:
    # - basic_valid: ≥3 holds, no duplicates, has start+finish, one board
    # - strict_valid: basic_valid + has middle + ≥4 holds
    # The "tokens" column comes back from CSV as text, so it is parsed into
    # a token list first.
    df_generated["tokens_parsed"] = df_generated["tokens"].apply(parse_token_list)
    df_generated["hold_records"] = df_generated["tokens_parsed"].apply(tokens_to_hold_records)
    # A frozenset of placement ids per route, compared against real routes
    # in the novelty step.
    df_generated["hold_set"] = df_generated["hold_records"].apply(
        lambda records: frozenset(int(record["placement_id"]) for record in records)
    )
    # One validity record per route, expanded into columns. The
    # "basic_valid_eval"/"strict_valid_eval" columns used below are expected
    # to come from validity_from_records.
    validity = pd.DataFrame(df_generated["hold_records"].apply(validity_from_records).tolist())
    # The index is reset so the column-wise concat aligns rows by position.
    df_eval = pd.concat([df_generated.reset_index(drop=True), validity], axis=1)
    print(f"Evaluated generated routes: {len(df_eval):,}")
    print("\nBasic validity by board:")
    print(df_eval.groupby("board_key")["basic_valid_eval"].mean())
    print("\nStrict validity by board:")
    print(df_eval.groupby("board_key")["strict_valid_eval"].mean())
    # ─────────────────────────────────────────────────────────────────────
    # Step 3: Novelty (Jaccard distance from nearest real route)
    # ─────────────────────────────────────────────────────────────────────
    # For each generated route, find the most similar real route on the
    # same board using Jaccard similarity of hold sets.
    # Novelty distance = 1 - Jaccard similarity
    # A value of 1.0 means completely novel (no shared holds)
    # A value of 0.0 means identical to an existing route
    df_real["real_holds"] = df_real["frames"].apply(frames_to_holds)
    df_real["hold_set"] = df_real["real_holds"].apply(holds_to_placement_set)
    # The whole real-route table is passed for every generated row.
    nearest = pd.DataFrame(
        df_eval.apply(
            lambda row: nearest_real_route_same_board(
                generated_set=row["hold_set"],
                generated_board_key=row["board_key"],
                real_df=df_real,
            ),
            axis=1,
        ).tolist()
    )
    df_eval = pd.concat([df_eval, nearest], axis=1)
    print("\nNovelty statistics:")
    print(df_eval[["board_key", "nearest_real_jaccard", "novelty_distance"]].describe())
    # ─────────────────────────────────────────────────────────────────────
    # Step 4: Geometric features
    # ─────────────────────────────────────────────────────────────────────
    # Compute simple spatial features for each generated route:
    # - Number of holds
    # - Height gained (max Y - min Y)
    # - Width span (max X - min X)
    # - Mean hand reach distance
    coords = build_placement_coords(df_token_meta)
    geom = pd.DataFrame(
        df_eval.apply(
            lambda row: simple_route_features(
                board_key=row["board_key"],
                records=row["hold_records"],
                placement_coords=coords,
            ),
            axis=1,
        ).tolist()
    )
    df_eval = pd.concat([df_eval, geom], axis=1)
    print("\nGeometric feature statistics:")
    print(df_eval[["board_key", "geom_n_holds", "geom_height", "geom_width", "geom_mean_hand_reach"]].describe())
    # ─────────────────────────────────────────────────────────────────────
    # Step 5: Grade consistency (using critic model)
    # ─────────────────────────────────────────────────────────────────────
    # If a trained grade predictor is available, use it as a "critic"
    # to check whether generated routes have grades consistent with
    # what was requested.
    # An explicit --device wins; otherwise prefer CUDA when it is available.
    device = torch.device(args.device or ("cuda" if torch.cuda.is_available() else "cpu"))
    critic = load_grade_critic(args.grade_model_path, device)
    if critic is not None:
        print("\nUsing grade critic for consistency scoring...")
        # The critic is run one route at a time.
        df_eval["critic_pred_display_difficulty"] = df_eval["tokens_parsed"].apply(
            lambda tokens: predict_generated_grade(tokens, critic, device)
        )
        df_eval["critic_pred_grouped_v"] = df_eval["critic_pred_display_difficulty"].apply(to_grouped_v)
        # Signed error: positive means the critic rates the route harder
        # than requested. Assumes generated_routes.csv carries a
        # "requested_grouped_v" column — TODO confirm against generate_one.
        df_eval["critic_v_error"] = df_eval["critic_pred_grouped_v"] - df_eval["requested_grouped_v"]
        print("\nCritic grade consistency by board:")
        # Percentage of routes per board with exact / ±1 / ±2 agreement.
        summary = df_eval.groupby("board_key")["critic_v_error"].agg(
            exact=lambda s: float((s == 0).mean() * 100),
            within_1=lambda s: float((s.abs() <= 1).mean() * 100),
            within_2=lambda s: float((s.abs() <= 2).mean() * 100),
        )
        print(summary)
    else:
        print("No trained grade critic found. Skipping critic-based scoring.")
    # ─────────────────────────────────────────────────────────────────────
    # Step 6: Rank routes by composite score
    # ─────────────────────────────────────────────────────────────────────
    # The composite score rewards:
    # - Basic validity (weight 2.0)
    # - Strict validity (weight 1.0)
    # - Novelty (weight 1.0)
    # - Grade consistency (weight 1.0 for ±1 V-grade, penalty for larger errors)
    ranked = df_eval.copy()
    ranked["score"] = 0.0
    ranked["score"] += ranked["basic_valid_eval"].astype(float) * 2.0
    ranked["score"] += ranked["strict_valid_eval"].astype(float) * 1.0
    # A missing novelty distance contributes nothing to the score.
    ranked["score"] += ranked["novelty_distance"].fillna(0.0)
    # The critic terms apply only when the critic ran in step 5.
    if "critic_v_error" in ranked.columns:
        ranked["score"] += (ranked["critic_v_error"].abs() <= 1).astype(float)
        # The penalty of 0.25 per V-grade applies to every non-zero error,
        # including errors within ±1.
        ranked["score"] -= 0.25 * ranked["critic_v_error"].abs()
    # Only the 100 highest-scoring routes are kept as top candidates.
    top_candidates = ranked.sort_values("score", ascending=False).head(100).reset_index(drop=True)
    print(f"\nTop 10 generated routes by composite score:")
    display_cols = ["board_key", "score", "basic_valid_eval", "strict_valid_eval", "novelty_distance"]
    if "critic_v_error" in top_candidates.columns:
        display_cols.append("critic_v_error")
    print(top_candidates[display_cols].head(10))
    # ─────────────────────────────────────────────────────────────────────
    # Step 7: Save results
    # ─────────────────────────────────────────────────────────────────────
    # The full table is saved without the score column; only the
    # top-candidates file contains it.
    df_eval.to_csv(args.out_dir / "generated_route_evaluation.csv", index=False)
    top_candidates.to_csv(args.out_dir / "top_generated_candidates.csv", index=False)
    print(f"\nSaved evaluation results to:")
    print(f" {args.out_dir / 'generated_route_evaluation.csv'}")
    print(f" {args.out_dir / 'top_generated_candidates.csv'}")
# Run the evaluation pipeline only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()