initial commit

2026-05-21 07:21:13 -04:00
commit d510d07ed9
50 changed files with 5359 additions and 0 deletions
@@ -0,0 +1,360 @@
+#!/usr/bin/env python3
+"""
+ClimbingBoardGPT — Route Tokenization Script
+
+This script converts raw climbing route data from SQLite databases into
+tokenized sequences suitable for training transformer models.
+
+What is tokenization?
+---------------------
+In NLP, tokenization converts raw text into discrete symbols (tokens) that
+a model can process. For example, GPT-2 uses Byte-Pair Encoding (BPE), which
+might split "climbing" into subword pieces such as ["cl", "imb", "ing"].
+
+For climbing routes, we tokenize differently:
+- Each hold on the board becomes a unique token (e.g., <TB2_p344_start>)
+- Board identity, angle, and grade become conditioning tokens
+- Special tokens mark sequence boundaries (<BOS>, <EOS>, etc.)
+
+The key insight: climbing routes ARE sequences, just like sentences. The
+same transformer architectures that learn English grammar can learn "climb
+grammar" — which holds tend to follow which, how start holds differ from
+finish holds, etc.
+
+This script:
+1. Loads board configurations from JSON files
+2. Queries SQLite databases for climb and placement data
+3. Parses frame strings (e.g., "p344r5p369r6p603r7") into structured data
+4. Maps board-specific role IDs to shared semantic roles
+5. Canonicalizes hold order (starts first, then middles by Y, etc.)
+6. Generates two token sequences per route:
+   - with_grade: includes <GRADE_V6> for GPT training
+   - without_grade: excludes grade for BERT-style prediction
+7. Builds vocabulary, train/val/test splits, and saves all artifacts
+
+Usage:
+    python scripts/01_tokenize_routes.py --boards tb2,kilter
+    python scripts/01_tokenize_routes.py --boards tb2
+    python scripts/01_tokenize_routes.py --boards kilter
+"""
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from pathlib import Path
+
+# Set up the project root so we can import our custom package
+REPO_ROOT = Path(__file__).resolve().parents[1]
+sys.path.insert(0, str(REPO_ROOT / "src"))
+
+import pandas as pd
+
+from climbingboardgpt.config import load_board_configs, parse_board_keys
+from climbingboardgpt.data import load_multi_board_data
+from climbingboardgpt.tokenization import (
+    build_route_records,
+    build_token_metadata,
+    build_vocab,
+    encode,
+    make_placement_lookup,
+    vocab_payload,
+)
+from climbingboardgpt.utils import json_safe, safe_train_test_split, set_seed, write_json
+
+
+def parse_args() -> argparse.Namespace:
+    """Build the tokenization CLI and parse the process arguments.
+
+    Options:
+        --boards: comma-separated board config names (default "tb2,kilter").
+        --out-dir: directory that receives the tokenized artifacts.
+        --seed: integer random seed (default 42).
+
+    Returns:
+        The parsed ``argparse.Namespace``.
+    """
+    usage_examples = """
+Examples:
+  # Tokenize both boards (default)
+  python scripts/01_tokenize_routes.py --boards tb2,kilter
+
+  # Tokenize only TB2
+  python scripts/01_tokenize_routes.py --boards tb2
+
+  # Custom output directory
+  python scripts/01_tokenize_routes.py --out-dir /path/to/output
+        """
+
+    cli = argparse.ArgumentParser(
+        description="Tokenize TB2 and/or Kilter routes for ClimbingBoardGPT.",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog=usage_examples,
+    )
+
+    # One (flag, keyword-arguments) pair per option, registered in order.
+    option_specs = (
+        ("--boards", {"type": str, "default": "tb2,kilter",
+                      "help": "Comma-separated board config names (default: tb2,kilter)"}),
+        ("--out-dir", {"type": Path, "default": REPO_ROOT / "data" / "processed" / "tokenized",
+                       "help": "Output directory for tokenized artifacts"}),
+        ("--seed", {"type": int, "default": 42,
+                    "help": "Random seed for reproducible splits (default: 42)"}),
+    )
+    for flag, keyword_args in option_specs:
+        cli.add_argument(flag, **keyword_args)
+
+    return cli.parse_args()
+
+
+def main() -> None:
+    """Run the route tokenization pipeline end to end.
+
+    1. Load board configurations
+    2. Query databases for raw climb and placement data
+    3. Build tokenized route records (parsed frames, canonical hold order)
+    4. Construct the vocabulary and encode every sequence as integer IDs
+    5. Split data into train/val/test sets (stratified by board × grade)
+    6. Build token metadata (coordinates, roles, etc.)
+    7. Save all artifacts under ``args.out_dir``
+
+    Raises RuntimeError if no routes were tokenized.
+    """
+    args = parse_args()
+
+    # Set random seed for reproducibility
+    # This ensures train/val/test splits are the same across runs
+    set_seed(args.seed)
+    args.out_dir.mkdir(parents=True, exist_ok=True)
+
+    # ─────────────────────────────────────────────────────────────────────
+    # Step 1: Load board configurations
+    # ─────────────────────────────────────────────────────────────────────
+    # Each board has a JSON config file specifying:
+    # - layout_id: Which layout in the database to use
+    # - role_definitions: Maps semantic roles (start, middle, etc.) to numeric IDs
+    # - max_angle: Filter out routes steeper than this
+    # - token_prefix: Namespace for hold tokens (prevents ID collisions)
+    # 
+    # This config-driven approach means adding a new board only requires
+    # creating a new JSON file, not modifying code.
+    board_keys = parse_board_keys(args.boards)
+    configs = load_board_configs(board_keys)
+    configs_by_key = {config.board_key: config for config in configs}
+    configs_by_prefix = {config.token_prefix: config for config in configs}
+
+    print(f"Loaded {len(configs)} board configuration(s):")
+    for config in configs:
+        print(f"  {config.display_name} (key={config.board_key}, prefix={config.token_prefix})")
+        print(f"    layout_id={config.layout_id}, max_angle={config.max_angle}")
+        print(f"    role_definitions={config.role_definitions}")
+
+    # ─────────────────────────────────────────────────────────────────────
+    # Step 2: Load raw data from SQLite databases
+    # ─────────────────────────────────────────────────────────────────────
+    # Each board has its own SQLite database containing:
+    # - climbs table: route metadata (name, setter, frames string, etc.)
+    # - climb_stats table: angle, difficulty, ascensionist count, quality
+    # - placements table: physical hold positions and default roles
+    # - holes table: (x, y) coordinates for each placement
+    # - difficulty_grades table: mapping from numeric difficulty to V-grades
+    #
+    # The frames string is the core data — it encodes which holds are used
+    # and their roles, e.g., "p344r5p369r6p603r7" means:
+    #   placement 344 with role 5 (start)
+    #   placement 369 with role 6 (middle)
+    #   placement 603 with role 7 (finish)
+    print("\nLoading data from databases...")
+    df_climbs, df_placements = load_multi_board_data(configs, project_root=REPO_ROOT)
+    placement_lookup = make_placement_lookup(df_placements)
+
+    print(f"  Total climb-angle entries: {len(df_climbs):,}")
+    print(f"  Total placements: {len(df_placements):,}")
+    print(f"  Per board:")
+    for board_key in df_climbs["board_key"].unique():
+        n = (df_climbs["board_key"] == board_key).sum()
+        print(f"    {board_key}: {n:,} entries")
+
+    # ─────────────────────────────────────────────────────────────────────
+    # Step 3: Build tokenized route records
+    # ─────────────────────────────────────────────────────────────────────
+    # This is the core tokenization step. For each climb:
+    # 1. Parse the frames string into (placement_id, role_id) pairs
+    # 2. Map role IDs to semantic names using board config
+    # 3. Sort holds canonically: starts first, then middles by Y, etc.
+    # 4. Generate two token sequences:
+    #    - with_grade: <BOS> <BOARD_X> <ANGLE_Y> <GRADE_VZ> <holds...> <EOS>
+    #    - without_grade: <BOS> <BOARD_X> <ANGLE_Y> <holds...> <EOS>
+    #
+    # The grade-included version is for the GPT generator (which conditions
+    # on grade). The grade-excluded version is for the BERT-style predictor
+    # (which must predict grade, not see it).
+    print("\nBuilding tokenized route records...")
+    df_routes = build_route_records(
+        df_climbs=df_climbs,
+        configs_by_key=configs_by_key,
+        placement_lookup=placement_lookup,
+    )
+    if df_routes.empty:
+        raise RuntimeError("No routes were tokenized. Check raw DBs and board configs.")
+
+    print(f"  Tokenized routes: {len(df_routes):,}")
+    print(f"  Per board:")
+    for board_key in df_routes["board_key"].unique():
+        n = (df_routes["board_key"] == board_key).sum()
+        print(f"    {board_key}: {n:,} routes")
+
+    # ─────────────────────────────────────────────────────────────────────
+    # Step 4: Build the shared vocabulary
+    # ─────────────────────────────────────────────────────────────────────
+    # The vocabulary maps each unique token to an integer ID.
+    # This is analogous to how GPT-2's tokenizer maps subwords to IDs.
+    #
+    # Vocabulary structure:
+    # 1. Special tokens (IDs 0-5): <PAD>, <UNK>, <BOS>, <EOS>, <CLS>, <MASK>
+    # 2. Board tokens: <BOARD_TB2>, <BOARD_KILTER>
+    # 3. Angle tokens: <ANGLE_10>, <ANGLE_15>, ..., <ANGLE_55>
+    # 4. Grade tokens: <GRADE_V0>, <GRADE_V1>, ..., <GRADE_V16>
+    # 5. Hold tokens: <TB2_p344_start>, <KILTER_p1084_middle>, etc.
+    #
+    # Hold tokens are namespaced by board to prevent ID collisions.
+    # TB2 placement 344 and Kilter placement 344 are different physical holds.
+    print("\nBuilding vocabulary...")
+    vocab_tokens, stoi, itos = build_vocab(df_routes)
+
+    print(f"  Vocabulary size: {len(stoi):,}")
+    special_count = sum(1 for t in vocab_tokens if t in ["<PAD>", "<UNK>", "<BOS>", "<EOS>", "<CLS>", "<MASK>"])
+    board_count = sum(1 for t in vocab_tokens if t.startswith("<BOARD_"))
+    angle_count = sum(1 for t in vocab_tokens if t.startswith("<ANGLE_"))
+    grade_count = sum(1 for t in vocab_tokens if t.startswith("<GRADE_"))
+    hold_count = sum(1 for t in vocab_tokens if "_p" in t)  # substring heuristic for hold tokens
+    print(f"  Special tokens: {special_count}")
+    print(f"  Board tokens: {board_count}")
+    print(f"  Angle tokens: {angle_count}")
+    print(f"  Grade tokens: {grade_count}")
+    print(f"  Hold tokens: {hold_count}")
+
+    # ─────────────────────────────────────────────────────────────────────
+    # Step 5: Encode token sequences as integer IDs
+    # ─────────────────────────────────────────────────────────────────────
+    # Convert string tokens to integer IDs for model input.
+    # This is the same as encoding text with a tokenizer:
+    #   "The cat sat" → [464, 3797, 3290]
+    #   "<BOS> <BOARD_TB2> <TB2_p344_start>" → [2, 6, 42]
+    df_routes["ids_with_grade"] = df_routes["tokens_with_grade"].apply(lambda tokens: encode(tokens, stoi))
+    df_routes["ids_no_grade"] = df_routes["tokens_no_grade"].apply(lambda tokens: encode(tokens, stoi))
+
+    # ─────────────────────────────────────────────────────────────────────
+    # Step 6: Train/val/test split (stratified)
+    # ─────────────────────────────────────────────────────────────────────
+    # We split 80/10/10 (20% held out, then halved), stratified by
+    # board_key × grouped_v so that both boards and all difficulty levels
+    # are represented in each split (e.g. not every V14 lands in test).
+    #
+    # NOTE(review): split labels are mapped back by uuid below; if a uuid can
+    # occur on several rows (e.g. one per angle) the last map update wins — confirm.
+    df_routes["split_stratum"] = (
+        df_routes["board_key"].astype(str)
+        + "__V"
+        + df_routes["grouped_v"].astype(str)
+    )
+
+    train_df, temp_df = safe_train_test_split(
+        df_routes,
+        test_size=0.20,
+        random_state=args.seed,
+        stratify_col="split_stratum",
+    )
+    val_df, test_df = safe_train_test_split(
+        temp_df,
+        test_size=0.50,
+        random_state=args.seed,
+        stratify_col="split_stratum",
+    )
+
+    split_map = {}
+    split_map.update({uuid: "train" for uuid in train_df["uuid"]})
+    split_map.update({uuid: "val" for uuid in val_df["uuid"]})
+    split_map.update({uuid: "test" for uuid in test_df["uuid"]})
+    df_routes["split"] = df_routes["uuid"].map(split_map)
+
+    print(f"\nSplit counts:")
+    print(df_routes.groupby(["board_key", "split"]).size().unstack(fill_value=0))
+
+    # ─────────────────────────────────────────────────────────────────────
+    # Step 7: Build token metadata
+    # ─────────────────────────────────────────────────────────────────────
+    # Each token has associated metadata:
+    # - kind: "special", "board", "angle", "grade", or "hold"
+    # - For hold tokens: board_key, placement_id, role, x, y, x_norm, y_norm
+    # - For angle tokens: the angle value
+    # - For grade tokens: the V-grade value
+    #
+    # The coordinate features (x_norm, y_norm, is_hold) are injected into
+    # the grade predictor model as additional embeddings alongside token
+    # embeddings. This gives the model direct spatial information.
+    print("\nBuilding token metadata...")
+    df_token_meta = build_token_metadata(
+        vocab_tokens=vocab_tokens,
+        stoi=stoi,
+        df_placements=df_placements,
+        placement_lookup=placement_lookup,
+        configs_by_prefix=configs_by_prefix,
+    )
+    print(f"  Token metadata rows: {len(df_token_meta):,}")
+
+    # ─────────────────────────────────────────────────────────────────────
+    # Step 8: Save all artifacts
+    # ─────────────────────────────────────────────────────────────────────
+    # Save multiple file formats for different use cases:
+    # - CSV: Easy to load in pandas for analysis
+    # - JSONL: Easy to stream for training
+    # - JSON: Vocabulary mapping for model loading
+    print("\nSaving artifacts...")
+    jsonl_path = args.out_dir / "routes_tokenized.jsonl"
+    with jsonl_path.open("w", encoding="utf-8") as handle:
+        for record in df_routes.to_dict(orient="records"):
+            handle.write(json.dumps(json_safe(record)) + "\n")
+
+    csv_cols = [
+        "uuid", "board_key", "board_display_name", "board_token_prefix", "board_token",
+        "climb_name", "setter_username", "layout_id", "layout_name", "board_name",
+        "frames", "angle", "display_difficulty", "grouped_v", "boulder_grade",
+        "ascensionist_count", "quality_average", "fa_at",
+        "n_holds", "n_start", "n_middle", "n_foot", "n_finish",
+        "sequence_with_grade", "sequence_no_grade", "split",
+    ]
+    df_routes[csv_cols].to_csv(args.out_dir / "route_sequences.csv", index=False)
+    df_placements.to_csv(args.out_dir / "placement_metadata.csv", index=False)
+    df_token_meta.to_csv(args.out_dir / "token_metadata.csv", index=False)
+    write_json(args.out_dir / "token_vocab.json", vocab_payload(stoi, itos, configs_by_key))
+
+    # Board summary statistics
+    board_summary = (
+        df_routes.groupby("board_key")
+        .agg(
+            n_routes=("uuid", "count"),
+            mean_angle=("angle", "mean"),
+            mean_display_difficulty=("display_difficulty", "mean"),
+            mean_holds=("n_holds", "mean"),
+        )
+        .reset_index()
+    )
+    board_summary.to_csv(args.out_dir / "board_summary.csv", index=False)
+
+    print(f"\n{'='*60}")
+    print(f"Tokenization complete!")
+    print(f"{'='*60}")
+    print(f"Boards: {board_keys}")
+    print(f"Tokenized routes: {len(df_routes):,}")
+    print(f"Vocabulary size: {len(stoi):,}")
+    print(f"Split counts:")
+    print(df_routes.groupby(["board_key", "split"]).size().unstack(fill_value=0))
+    print(f"\nSaved artifacts to: {args.out_dir}")
+    for f in sorted(args.out_dir.iterdir()):  # lists every entry in out_dir, not only files written by this run
+        size_mb = f.stat().st_size / 1e6
+        print(f"  {f.name} ({size_mb:.1f} MB)")
+
+
+if __name__ == "__main__":  # run the tokenization pipeline only when executed as a script
+    main()
@@ -0,0 +1,393 @@
+#!/usr/bin/env python3
+"""
+ClimbingBoardGPT — Grade Prediction Training Script
+
+This script trains a BERT-style transformer encoder to predict climb difficulty
+from tokenized route sequences.
+
+Architecture Overview:
+----------------------
+The model is a Transformer Encoder (similar to BERT) with a regression head:
+
+    Input: <CLS> <BOARD_TB2> <ANGLE_40> <TB2_p344_start> ... <TB2_p603_finish>
+              ↓
+    Token Embedding + Position Embedding + Coordinate Features
+              ↓
+    Transformer Encoder (4 layers, 4 heads, d_model=128)
+              ↓
+    <CLS> token output (pooled representation of the entire sequence)
+              ↓
+    MLP Head → single scalar (predicted difficulty)
+
+Key Concepts:
+1. <CLS> pooling: The <CLS> token aggregates information from the entire
+   sequence via self-attention. This is the standard BERT approach for
+   sequence-level tasks.
+
+2. Coordinate features: Each hold token has physical (x, y) position
+   information that gets projected and added to the embedding. This gives
+   the model direct spatial knowledge without needing to learn it from data.
+
+3. No grade token in input: The grade predictor must PREDICT the grade,
+   not see it. We use the "no_grade" token sequence.
+
+4. MSE loss: Since we're predicting a continuous value (difficulty score),
+   we use Mean Squared Error loss rather than cross-entropy.
+
+5. Joint training: Both TB2 and Kilter routes are trained together,
+   with <BOARD_TB2> / <BOARD_KILTER> tokens telling the model which
+   board it's operating on.
+
+Usage:
+    python scripts/02_train_grade_predictor.py
+    python scripts/02_train_grade_predictor.py --epochs 100 --lr 1e-4
+"""
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from pathlib import Path
+
+REPO_ROOT = Path(__file__).resolve().parents[1]
+sys.path.insert(0, str(REPO_ROOT / "src"))
+
+import numpy as np
+import pandas as pd
+import torch
+import torch.nn as nn
+from torch.utils.data import DataLoader
+
+from climbingboardgpt.datasets import RouteGradeDataset
+from climbingboardgpt.grades import to_grouped_v
+from climbingboardgpt.metrics import metrics_by_board, print_metrics, regression_metrics
+from climbingboardgpt.models import JointRouteTransformerRegressor
+from climbingboardgpt.utils import set_seed, write_json
+
+
+def parse_args() -> argparse.Namespace:
+    """Define and parse the command-line options of the training script.
+
+    Covers artifact paths, optimisation settings (epochs, patience, batch size,
+    learning rate, weight decay, seed, device) and the transformer's dimensions.
+    """
+    processed_root = REPO_ROOT / "data" / "processed"
+    cli = argparse.ArgumentParser(
+        description="Train a joint TB2/Kilter transformer grade predictor.",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+The model predicts display_difficulty (a continuous value) from tokenized
+route sequences. Evaluation metrics include MAE, RMSE, R², and V-grade
+accuracy (within ±1 V-grade).
+        """,
+    )
+    # (flag, type, default, help); help=None is what argparse uses when omitted.
+    option_table = [
+        ("--tokenized-dir", Path, processed_root / "tokenized", None),
+        ("--out-dir", Path, processed_root / "grade_prediction", None),
+        ("--model-dir", Path, REPO_ROOT / "models", None),
+        ("--epochs", int, 75, "Maximum training epochs"),
+        ("--patience", int, 12, "Early stopping patience"),
+        ("--batch-size", int, 128, "Training batch size"),
+        ("--lr", float, 3e-4, "Learning rate"),
+        ("--weight-decay", float, 1e-2, "AdamW weight decay"),
+        ("--d-model", int, 128, "Transformer embedding dimension"),
+        ("--nhead", int, 4, "Number of attention heads"),
+        ("--num-layers", int, 4, "Number of transformer layers"),
+        ("--dim-feedforward", int, 256, "Feedforward dimension"),
+        ("--dropout", float, 0.10, "Dropout probability"),
+        ("--seed", int, 42, "Random seed"),
+        ("--device", str, None, "Device (cpu or cuda)"),
+    ]
+    for flag, value_type, default, help_text in option_table:
+        cli.add_argument(flag, type=value_type, default=default, help=help_text)
+    return cli.parse_args()
+
+
+def build_coord_features(df_token_meta: pd.DataFrame, vocab_size: int) -> torch.Tensor:
+    """Build the per-token coordinate feature matrix for the transformer model.
+
+    Row ``token_id`` of the result holds ``[x_norm, y_norm, is_hold]`` taken from
+    ``df_token_meta``. A missing column, a NaN value, or a token id without a
+    metadata row all give 0.0; for a repeated token id the last row wins.
+    The columns are written with vectorised NumPy assignment instead of a
+    per-row ``iterrows()`` loop, which is slow on a full vocabulary.
+
+    Args:
+        df_token_meta: Token metadata with a ``token_id`` column and, optionally,
+            ``x_norm``, ``y_norm`` and ``is_hold`` columns.
+        vocab_size: Number of rows of the returned matrix.
+
+    Returns:
+        A float32 tensor of shape ``(vocab_size, 3)``.
+    """
+    features = np.zeros((vocab_size, 3), dtype=np.float32)
+    if not df_token_meta.empty:
+        # keep="last" reproduces the overwrite order of a sequential row loop.
+        meta = df_token_meta.drop_duplicates(subset="token_id", keep="last")
+        token_ids = meta["token_id"].astype(np.int64).to_numpy()
+        for col_idx, col_name in enumerate(("x_norm", "y_norm", "is_hold")):
+            if col_name in meta.columns:  # absent feature column stays all-zero
+                values = pd.to_numeric(meta[col_name]).fillna(0.0)
+                features[token_ids, col_idx] = values.to_numpy(dtype=np.float32)
+    return torch.tensor(features, dtype=torch.float32)
+
+
+def run_epoch(model, loader, device, optimizer=None):
+    """Run one pass over ``loader`` in training or evaluation mode.
+
+    Evaluation (``optimizer is None``) runs with autograd disabled, so no graph
+    is recorded for batches that never receive a backward pass.
+
+    Args:
+        model: Module called as ``model(input_ids, attention_mask)``.
+        loader: Iterable of batch dicts with the keys ``input_ids``,
+            ``attention_mask``, ``target``, ``uuid`` and ``board_key``; its
+            ``.dataset`` length is used to average the loss.
+        device: torch device the batch tensors are moved to.
+        optimizer: If provided, train (backward, clip grad norm to 1.0, step);
+            if None, evaluate without updating weights.
+
+    Returns:
+        Tuple of (average_loss, predictions, targets, uuids, board_keys)
+    """
+    is_train = optimizer is not None
+    model.train(is_train)
+    criterion = nn.MSELoss()
+    losses, preds, targets, uuids, boards = [], [], [], [], []
+
+    for batch in loader:
+        input_ids = batch["input_ids"].to(device)
+        attention_mask = batch["attention_mask"].to(device)
+        target = batch["target"].to(device)
+
+        if is_train:
+            optimizer.zero_grad(set_to_none=True)
+        # Record the autograd graph only when a backward pass will follow.
+        with torch.set_grad_enabled(is_train):
+            pred = model(input_ids, attention_mask)
+            loss = criterion(pred, target)
+        if is_train:
+            loss.backward()
+            # Gradient clipping prevents exploding gradients
+            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
+            optimizer.step()
+
+        losses.append(loss.item() * input_ids.size(0))  # weight by batch size
+        preds.extend(pred.detach().cpu().numpy().tolist())
+        targets.extend(target.detach().cpu().numpy().tolist())
+        uuids.extend(batch["uuid"])
+        boards.extend(batch["board_key"])
+    return sum(losses) / max(1, len(loader.dataset)), np.asarray(preds), np.asarray(targets), uuids, boards
+
+
+def main() -> None:
+    """Train, evaluate and save the joint grade predictor.
+
+    1. Load tokenized data and vocabulary
+    2. Prepare input sequences (first token swapped for <CLS>, no grade)
+    3. Create train/val/test DataLoaders
+    4. Build the coordinate feature matrix and initialize the model
+    5. Train with early stopping on validation MAE
+    6. Evaluate the best weights on the test set
+    7. Save training history, predictions, metrics and the checkpoint
+
+    Raises FileNotFoundError if the tokenized artifacts are missing.
+    """
+    args = parse_args()
+    set_seed(args.seed)
+    args.out_dir.mkdir(parents=True, exist_ok=True)
+    args.model_dir.mkdir(parents=True, exist_ok=True)
+
+    # ─────────────────────────────────────────────────────────────────────
+    # Step 1: Load tokenized data
+    # ─────────────────────────────────────────────────────────────────────
+    seq_path = args.tokenized_dir / "route_sequences.csv"
+    vocab_path = args.tokenized_dir / "token_vocab.json"
+    meta_path = args.tokenized_dir / "token_metadata.csv"
+    if not seq_path.exists() or not vocab_path.exists() or not meta_path.exists():
+        raise FileNotFoundError("Missing tokenized artifacts. Run scripts/01_tokenize_routes.py first.")
+
+    df_routes = pd.read_csv(seq_path)
+    vocab = json.loads(vocab_path.read_text(encoding="utf-8"))
+    stoi = {str(k): int(v) for k, v in vocab["stoi"].items()}
+    itos = {int(k): str(v) for k, v in vocab["itos"].items()}
+    df_token_meta = pd.read_csv(meta_path)
+
+    pad_id = stoi["<PAD>"]
+    unk_id = stoi["<UNK>"]
+
+    # ─────────────────────────────────────────────────────────────────────
+    # Step 2: Prepare input sequences
+    # ─────────────────────────────────────────────────────────────────────
+    # For grade prediction, we use the "no_grade" version of the sequence and
+    # replace its first token (presumably <BOS>) with <CLS> for pooling.
+    # The model must PREDICT the grade, not see it in the input!
+    def encode(tokens):
+        return [stoi.get(token, unk_id) for token in tokens]
+
+    df_routes["tokens_no_grade"] = df_routes["sequence_no_grade"].fillna("").str.split()
+    df_routes["model_tokens"] = df_routes["tokens_no_grade"].apply(
+        lambda tokens: ["<CLS>"] + tokens[1:] if tokens else ["<CLS>"]
+    )
+    df_routes["model_ids"] = df_routes["model_tokens"].apply(encode)
+    df_routes["seq_len"] = df_routes["model_ids"].apply(len)
+    max_len = int(df_routes["seq_len"].max())  # longest sequence across all splits
+
+    # ─────────────────────────────────────────────────────────────────────
+    # Step 3: Create DataLoaders
+    # ─────────────────────────────────────────────────────────────────────
+    train_df = df_routes[df_routes["split"] == "train"].reset_index(drop=True)
+    val_df = df_routes[df_routes["split"] == "val"].reset_index(drop=True)
+    test_df = df_routes[df_routes["split"] == "test"].reset_index(drop=True)
+
+    train_ds = RouteGradeDataset(train_df, max_len=max_len, pad_id=pad_id)
+    val_ds = RouteGradeDataset(val_df, max_len=max_len, pad_id=pad_id)
+    test_ds = RouteGradeDataset(test_df, max_len=max_len, pad_id=pad_id)
+
+    train_loader = DataLoader(train_ds, batch_size=args.batch_size, shuffle=True)
+    val_loader = DataLoader(val_ds, batch_size=args.batch_size, shuffle=False)
+    test_loader = DataLoader(test_ds, batch_size=args.batch_size, shuffle=False)
+
+    # ─────────────────────────────────────────────────────────────────────
+    # Step 4: Initialize model
+    # ─────────────────────────────────────────────────────────────────────
+    device = torch.device(args.device or ("cuda" if torch.cuda.is_available() else "cpu"))
+    coord_features = build_coord_features(df_token_meta, vocab_size=len(stoi))
+
+    model = JointRouteTransformerRegressor(
+        vocab_size=len(stoi),
+        max_len=max_len,
+        coord_features=coord_features,
+        d_model=args.d_model,
+        nhead=args.nhead,
+        num_layers=args.num_layers,
+        dim_feedforward=args.dim_feedforward,
+        dropout=args.dropout,
+        pad_id=pad_id,
+    ).to(device)
+
+    optimizer = torch.optim.AdamW(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
+
+    print(f"Device: {device}")
+    print(f"Train/val/test: {len(train_ds):,}, {len(val_ds):,}, {len(test_ds):,}")
+    print(f"Vocabulary size: {len(stoi):,}")
+    print(f"Max sequence length: {max_len}")
+    print(f"Parameters: {sum(p.numel() for p in model.parameters()):,}")
+
+    # ─────────────────────────────────────────────────────────────────────
+    # Step 5: Training loop with early stopping
+    # ─────────────────────────────────────────────────────────────────────
+    history = []
+    best_val_mae = float("inf")
+    best_state = None
+    best_epoch = 0
+    epochs_without_improvement = 0
+
+    print("\nStarting training...")
+    for epoch in range(1, args.epochs + 1):
+        train_loss, train_pred, train_true, _, _ = run_epoch(model, train_loader, device, optimizer)
+        val_loss, val_pred, val_true, _, _ = run_epoch(model, val_loader, device, optimizer=None)
+
+        train_metrics = regression_metrics(train_true, train_pred)
+        val_metrics = regression_metrics(val_true, val_pred)
+
+        history.append({
+            "epoch": epoch,
+            "train_loss": train_loss,
+            "val_loss": val_loss,
+            "train_mae": train_metrics["mae"],
+            "val_mae": val_metrics["mae"],
+            "train_r2": train_metrics["r2"],
+            "val_r2": val_metrics["r2"],
+            "val_within_1_vgrade": val_metrics["within_1_vgrade"],
+        })
+
+        # Track best model by validation MAE (weights snapshotted as CPU copies)
+        if val_metrics["mae"] < best_val_mae:
+            best_val_mae = val_metrics["mae"]
+            best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
+            best_epoch = epoch
+            epochs_without_improvement = 0
+        else:
+            epochs_without_improvement += 1
+
+        if epoch == 1 or epoch % 5 == 0 or epoch == best_epoch:
+            print(
+                f"Epoch {epoch:03d} | "
+                f"train MAE {train_metrics['mae']:.3f} | "
+                f"val MAE {val_metrics['mae']:.3f} | "
+                f"val R² {val_metrics['r2']:.3f} | "
+                f"val ±1V {val_metrics['within_1_vgrade']:.1f}%"
+            )
+
+        if epochs_without_improvement >= args.patience:
+            print(f"Early stopping at epoch {epoch}; best epoch was {best_epoch}.")
+            break
+
+    # Restore the best-validation weights (None only if no epoch improved on inf)
+    if best_state is not None:
+        model.load_state_dict(best_state)
+
+    # ─────────────────────────────────────────────────────────────────────
+    # Step 6: Test set evaluation
+    # ─────────────────────────────────────────────────────────────────────
+    test_loss, test_pred, test_true, test_uuid, test_board = run_epoch(model, test_loader, device, optimizer=None)
+    overall_metrics = regression_metrics(test_true, test_pred)
+
+    pred_df = pd.DataFrame({
+        "uuid": test_uuid,
+        "board_key": test_board,
+        "y_true": test_true,
+        "y_pred": test_pred,
+        "abs_error": np.abs(test_true - test_pred),
+        "true_v": [to_grouped_v(value) for value in test_true],
+        "pred_v": [to_grouped_v(value) for value in test_pred],
+    })
+    pred_df = pred_df.merge(
+        df_routes[["uuid", "climb_name", "angle", "boulder_grade", "sequence_no_grade"]],
+        on="uuid",  # NOTE(review): assumes one df_routes row per uuid, else rows multiply — confirm
+        how="left",
+    )
+    board_metrics_df = metrics_by_board(pred_df)
+
+    print_metrics("Overall joint test performance", overall_metrics)
+    print("\nBoard-specific test performance:")
+    print(board_metrics_df.to_string(index=False))
+
+    # ─────────────────────────────────────────────────────────────────────
+    # Step 7: Save artifacts
+    # ─────────────────────────────────────────────────────────────────────
+    pd.DataFrame(history).to_csv(args.out_dir / "training_history.csv", index=False)
+    pred_df.to_csv(args.out_dir / "test_predictions.csv", index=False)
+    board_metrics_df.to_csv(args.out_dir / "board_metrics.csv", index=False)
+    write_json(args.out_dir / "overall_metrics.json", overall_metrics)
+
+    # Save model checkpoint with all necessary info for loading
+    checkpoint = {
+        "model_state_dict": model.state_dict(),
+        "config": {
+            "vocab_size": len(stoi),
+            "max_len": max_len,
+            "d_model": args.d_model,
+            "nhead": args.nhead,
+            "num_layers": args.num_layers,
+            "dim_feedforward": args.dim_feedforward,
+            "dropout": args.dropout,
+            "pad_id": pad_id,
+        },
+        "stoi": stoi,
+        "itos": {str(k): v for k, v in itos.items()},
+        "coord_features": coord_features.cpu(),
+        "overall_metrics": overall_metrics,
+    }
+    model_path = args.model_dir / "joint_transformer_grade_predictor.pth"
+    torch.save(checkpoint, model_path)
+
+    print("\nSaved:")
+    print(f"  {args.out_dir}")
+    print(f"  {model_path}")
+
+
+if __name__ == "__main__":  # start training only when executed as a script
+    main()
@@ -0,0 +1,388 @@
+#!/usr/bin/env python3
+"""
+ClimbingBoardGPT — Route Generation Training Script
+
+This script trains a GPT-style causal transformer to generate new climbing
+routes conditioned on board type, angle, and target grade.
+
+Architecture Overview:
+----------------------
+The model is a causal (autoregressive) transformer decoder:
+
+    Input: <BOS> <BOARD_TB2> <ANGLE_40> <GRADE_V6> <TB2_p344_start> ...
+              ↓
+    Token Embedding + Position Embedding
+              ↓
+    Causal Transformer (4 layers, 4 heads, n_embd=128)
+    [Each position can only attend to previous positions]
+              ↓
+    Language Modeling Head → next token logits
+              ↓
+    Sample next token → append to sequence → repeat
+
+Key Concepts:
+1. Causal masking: Unlike BERT which sees all tokens, GPT can only
+   attend to previous tokens. This enables autoregressive generation.
+
+2. Teacher forcing: During training, we feed the ground-truth previous
+   token. During generation, we feed the model's own prediction.
+
+3. Weight tying: The output projection shares weights with the input
+   embedding. This reduces parameters and improves training stability.
+
+4. Temperature & top-k sampling: Control generation diversity.
+   - Low temperature (0.3) → conservative, realistic routes
+   - High temperature (1.5) → creative, unusual routes
+   - Top-k (default 50) → only consider the 50 most likely next tokens
+
+5. Conditioning: The prompt tokens (<BOARD_...>, <ANGLE_...>, <GRADE_...>)
+   tell the model what kind of route to generate, similar to how
+   ChatGPT uses system prompts.
+
+Usage:
+    python scripts/03_train_route_generator.py
+    python scripts/03_train_route_generator.py --epochs 100 --temperature 0.7
+    python scripts/03_train_route_generator.py --generate-board tb2 --generate-grades 3,5,7
+"""
+from __future__ import annotations
+
+import argparse
+import json
+import math
+import sys
+from pathlib import Path
+
+REPO_ROOT = Path(__file__).resolve().parents[1]
+sys.path.insert(0, str(REPO_ROOT / "src"))
+
+import pandas as pd
+import torch
+from torch.utils.data import DataLoader
+
+from climbingboardgpt.config import load_board_configs, parse_board_keys
+from climbingboardgpt.datasets import RouteGPTDataset
+from climbingboardgpt.generation import generate_one
+from climbingboardgpt.models import JointRouteGPT
+from climbingboardgpt.utils import set_seed
+
+
+def csv_ints(value: str | None) -> list[int] | None:
+    """Convert a comma-separated string such as ``"3, 5,7"`` into integers.
+
+    Args:
+        value: Raw option text, or None when the option was not supplied.
+
+    Returns:
+        None when ``value`` is None or contains only whitespace; otherwise
+        the integers found between commas, with empty fields skipped.
+
+    Raises:
+        ValueError: If a non-empty field is not a valid integer.
+    """
+    if value is None:
+        return None
+    if not value.strip():
+        return None
+    fields = (field.strip() for field in value.split(","))
+    return [int(field) for field in fields if field]
+
+
+def parse_args() -> argparse.Namespace:
+    """Build the command-line interface for generator training and parse it.
+
+    Returns:
+        Namespace with the data/model paths, training hyperparameters,
+        sampling settings, and optional generation filters.
+    """
+    processed_root = REPO_ROOT / "data" / "processed"
+    parser = argparse.ArgumentParser(
+        description="Train a joint TB2/Kilter GPT-style route generator.",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+After training, the script generates sample routes for each board at
+common angles and grades. Use --generate-board to generate for a
+specific board, or leave unset to generate for all boards.
+        """,
+    )
+
+    # (flag, type, default, help) in the order the options are registered.
+    # A help value of None matches argparse's own default for that keyword.
+    option_table = [
+        ("--tokenized-dir", Path, processed_root / "tokenized", None),
+        ("--out-dir", Path, processed_root / "generation", None),
+        ("--model-dir", Path, REPO_ROOT / "models", None),
+        ("--boards", str, "tb2,kilter", "Board configs for role reconstruction"),
+        ("--epochs", int, 60, "Maximum training epochs"),
+        ("--patience", int, 10, "Early stopping patience"),
+        ("--batch-size", int, 128, "Training batch size"),
+        ("--lr", float, 3e-4, "Learning rate"),
+        ("--weight-decay", float, 1e-2, "AdamW weight decay"),
+        ("--n-embd", int, 128, "Embedding dimension"),
+        ("--n-head", int, 4, "Number of attention heads"),
+        ("--n-layer", int, 4, "Number of transformer layers"),
+        ("--dropout", float, 0.10, "Dropout probability"),
+        ("--temperature", float, 0.9, "Sampling temperature"),
+        ("--top-k", int, 50, "Top-k sampling parameter"),
+        ("--max-new-tokens", int, 40, "Max tokens to generate"),
+        ("--n-per-condition", int, 10, "Routes to generate per condition"),
+        ("--generate-board", str, None, "Board key: tb2 or kilter"),
+        ("--generate-angles", str, None, "Comma-separated angles"),
+        ("--generate-grades", str, None, "Comma-separated V-grades"),
+        ("--seed", int, 42, "Random seed"),
+        ("--device", str, None, "Device (cpu or cuda)"),
+    ]
+    for flag, value_type, default_value, help_text in option_table:
+        parser.add_argument(flag, type=value_type, default=default_value, help=help_text)
+
+    return parser.parse_args()
+
+
+def evaluate_loss(model, loader, device) -> float:
+    """Compute the example-weighted mean loss of ``model`` over ``loader``.
+
+    The model is switched to eval mode and gradients are disabled. Each
+    batch is a mapping with "input_ids" and "target_ids" tensors, and the
+    model is called as ``model(inputs, targets)`` returning a pair whose
+    second element is the batch loss.
+
+    Returns:
+        Sum of (batch loss * batch size) divided by the number of examples
+        seen, or 0.0 when the loader yields no batches.
+    """
+    model.eval()
+    weighted_losses = []
+    example_count = 0
+    with torch.no_grad():
+        for batch in loader:
+            inputs = batch["input_ids"].to(device)
+            targets = batch["target_ids"].to(device)
+            _logits, batch_loss = model(inputs, targets)
+            n_examples = inputs.size(0)
+            example_count += n_examples
+            # Weight by batch size so a short final batch is not over-counted.
+            weighted_losses.append(batch_loss.item() * n_examples)
+    # "or 1" avoids dividing by zero when the loader is empty.
+    return sum(weighted_losses) / (example_count or 1)
+
+
+def train_one_epoch(model, loader, optimizer, device) -> float:
+    """Run one optimisation pass over ``loader`` and return the mean loss.
+
+    Training uses teacher forcing: each batch supplies ground-truth
+    "input_ids" and the "target_ids" to predict, and the model is called
+    as ``model(inputs, targets)`` returning a pair whose second element is
+    the batch loss.
+
+    Returns:
+        Sum of (batch loss * batch size) divided by the number of examples
+        seen, or 0.0 when the loader yields no batches.
+    """
+    model.train()
+    weighted_losses = []
+    example_count = 0
+    for batch in loader:
+        inputs, targets = (batch[key].to(device) for key in ("input_ids", "target_ids"))
+
+        optimizer.zero_grad(set_to_none=True)
+        _logits, batch_loss = model(inputs, targets)
+        batch_loss.backward()
+        # Cap the global gradient norm at 1.0 to guard against exploding gradients.
+        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
+        optimizer.step()
+
+        n_examples = inputs.size(0)
+        example_count += n_examples
+        weighted_losses.append(batch_loss.item() * n_examples)
+    # "or 1" avoids dividing by zero when the loader is empty.
+    return sum(weighted_losses) / (example_count or 1)
+
+
+def main() -> None:
+    """Train the route generator, evaluate it, checkpoint it, and sample routes.
+
+    Steps:
+    1. Load tokenized data and vocabulary
+    2. Prepare input/target pairs for causal language modeling
+    3. Create train/val/test DataLoaders
+    4. Initialize GPT model
+    5. Train with early stopping on validation loss
+    6. Evaluate the best weights on the test split
+    7. Save the training history and model checkpoint
+    8. Generate sample routes and save them
+
+    The checkpoint is written before sample generation so that a failure
+    while generating does not discard a finished training run.
+
+    Raises:
+        FileNotFoundError: If the tokenized artifacts are missing.
+        RuntimeError: If there are no routes, the sequences are too short,
+            or the train or validation split is empty.
+        KeyError: If a board selected for generation has routes but no
+            config among those loaded via --boards.
+    """
+    args = parse_args()
+    set_seed(args.seed)
+    args.out_dir.mkdir(parents=True, exist_ok=True)
+    args.model_dir.mkdir(parents=True, exist_ok=True)
+
+    # ─────────────────────────────────────────────────────────────────────
+    # Step 1: Load data
+    # ─────────────────────────────────────────────────────────────────────
+    seq_path = args.tokenized_dir / "route_sequences.csv"
+    vocab_path = args.tokenized_dir / "token_vocab.json"
+    if not seq_path.exists() or not vocab_path.exists():
+        raise FileNotFoundError("Missing tokenized artifacts. Run scripts/01_tokenize_routes.py first.")
+
+    df_routes = pd.read_csv(seq_path)
+    if df_routes.empty:
+        # Without this guard the max sequence length below is NaN and the
+        # failure surfaces as an opaque int() conversion error.
+        raise RuntimeError(f"No routes found in {seq_path}.")
+    vocab = json.loads(vocab_path.read_text(encoding="utf-8"))
+    stoi = {str(k): int(v) for k, v in vocab["stoi"].items()}
+    itos = {int(k): str(v) for k, v in vocab["itos"].items()}
+    pad_id = stoi["<PAD>"]
+    unk_id = stoi["<UNK>"]
+
+    # ─────────────────────────────────────────────────────────────────────
+    # Step 2: Prepare sequences for causal language modeling
+    # ─────────────────────────────────────────────────────────────────────
+    # For GPT training, we use the "with grade" version because the model
+    # needs to learn the relationship between grade and hold selection.
+    #
+    # Input:  <BOS> <BOARD_TB2> <ANGLE_40> <GRADE_V6> <TB2_p344_start> ...
+    # Target: <BOARD_TB2> <ANGLE_40> <GRADE_V6> <TB2_p344_start> <TB2_p369_middle> ...
+    #
+    # The input is shifted right by one position compared to the target.
+    # This is the standard causal language modeling setup.
+    def encode(tokens):
+        return [stoi.get(token, unk_id) for token in tokens]
+
+    df_routes["gpt_tokens"] = df_routes["sequence_with_grade"].fillna("").str.split()
+    df_routes["gpt_ids"] = df_routes["gpt_tokens"].apply(encode)
+    df_routes["seq_len"] = df_routes["gpt_ids"].apply(len)
+    max_len = int(df_routes["seq_len"].max())
+    if max_len < 2:
+        raise RuntimeError("Token sequences are too short to train the causal model.")
+    block_size = max_len - 1  # Input length (one less than full sequence)
+
+    # ─────────────────────────────────────────────────────────────────────
+    # Step 3: Create DataLoaders
+    # ─────────────────────────────────────────────────────────────────────
+    train_df = df_routes[df_routes["split"] == "train"].reset_index(drop=True)
+    val_df = df_routes[df_routes["split"] == "val"].reset_index(drop=True)
+    test_df = df_routes[df_routes["split"] == "test"].reset_index(drop=True)
+
+    # evaluate_loss() reports 0.0 for an empty loader. With no validation
+    # rows that value would be recorded as the best loss at epoch 1 and
+    # early stopping would silently keep the epoch-1 weights.
+    if train_df.empty or val_df.empty:
+        raise RuntimeError(
+            "Train and validation splits must both be non-empty "
+            f"(train={len(train_df)}, val={len(val_df)}); "
+            "check the 'split' column of route_sequences.csv."
+        )
+
+    train_ds = RouteGPTDataset(train_df, max_len=max_len, pad_id=pad_id)
+    val_ds = RouteGPTDataset(val_df, max_len=max_len, pad_id=pad_id)
+    test_ds = RouteGPTDataset(test_df, max_len=max_len, pad_id=pad_id)
+
+    train_loader = DataLoader(train_ds, batch_size=args.batch_size, shuffle=True)
+    val_loader = DataLoader(val_ds, batch_size=args.batch_size, shuffle=False)
+    test_loader = DataLoader(test_ds, batch_size=args.batch_size, shuffle=False)
+
+    # ─────────────────────────────────────────────────────────────────────
+    # Step 4: Initialize model
+    # ─────────────────────────────────────────────────────────────────────
+    device = torch.device(args.device or ("cuda" if torch.cuda.is_available() else "cpu"))
+    model = JointRouteGPT(
+        vocab_size=len(stoi),
+        block_size=block_size,
+        n_embd=args.n_embd,
+        n_head=args.n_head,
+        n_layer=args.n_layer,
+        dropout=args.dropout,
+        pad_id=pad_id,
+    ).to(device)
+    optimizer = torch.optim.AdamW(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
+
+    print(f"Device: {device}")
+    print(f"Train/val/test: {len(train_ds):,}, {len(val_ds):,}, {len(test_ds):,}")
+    print(f"Vocabulary size: {len(stoi):,}")
+    print(f"Block size: {block_size}")
+    print(f"Parameters: {sum(p.numel() for p in model.parameters()):,}")
+
+    # ─────────────────────────────────────────────────────────────────────
+    # Step 5: Training loop with early stopping
+    # ─────────────────────────────────────────────────────────────────────
+    # We track perplexity (exp(loss)) as well as raw loss.
+    # Perplexity answers: "On average, how many tokens was the model
+    # choosing between at each step?"
+    # Lower perplexity = better model.
+    # The loss is capped at 20 before exponentiating so an early, very
+    # large loss cannot overflow math.exp.
+    history = []
+    best_val_loss = float("inf")
+    best_state = None
+    best_epoch = 0
+    epochs_without_improvement = 0
+
+    print("\nStarting GPT training...")
+    for epoch in range(1, args.epochs + 1):
+        train_loss = train_one_epoch(model, train_loader, optimizer, device)
+        val_loss = evaluate_loss(model, val_loader, device)
+        history.append({
+            "epoch": epoch,
+            "train_loss": train_loss,
+            "val_loss": val_loss,
+            "train_perplexity": math.exp(min(train_loss, 20)),
+            "val_perplexity": math.exp(min(val_loss, 20)),
+        })
+
+        if val_loss < best_val_loss:
+            best_val_loss = val_loss
+            # Snapshot on CPU so the best weights survive later updates.
+            best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
+            best_epoch = epoch
+            epochs_without_improvement = 0
+        else:
+            epochs_without_improvement += 1
+
+        if epoch == 1 or epoch % 5 == 0 or epoch == best_epoch:
+            print(
+                f"Epoch {epoch:03d} | "
+                f"train loss {train_loss:.3f} | "
+                f"val loss {val_loss:.3f} | "
+                f"val ppl {math.exp(min(val_loss, 20)):.1f}"
+            )
+
+        if epochs_without_improvement >= args.patience:
+            print(f"Early stopping at epoch {epoch}; best epoch was {best_epoch}.")
+            break
+
+    if best_state is not None:
+        model.load_state_dict(best_state)
+
+    # ─────────────────────────────────────────────────────────────────────
+    # Step 6: Test evaluation
+    # ─────────────────────────────────────────────────────────────────────
+    test_loss = evaluate_loss(model, test_loader, device)
+    print(f"\nBest validation loss: {best_val_loss:.4f}")
+    print(f"Test loss: {test_loss:.4f}")
+    print(f"Test perplexity: {math.exp(min(test_loss, 20)):.1f}")
+
+    # ─────────────────────────────────────────────────────────────────────
+    # Step 7: Save training artifacts
+    # ─────────────────────────────────────────────────────────────────────
+    # Persist the trained model before sampling: generation depends on
+    # board configs and user-supplied filters, and an error there must not
+    # throw away the training run.
+    pd.DataFrame(history).to_csv(args.out_dir / "training_history.csv", index=False)
+
+    checkpoint = {
+        "model_state_dict": model.state_dict(),
+        "config": {
+            "vocab_size": len(stoi),
+            "block_size": block_size,
+            "n_embd": args.n_embd,
+            "n_head": args.n_head,
+            "n_layer": args.n_layer,
+            "dropout": args.dropout,
+            "pad_id": pad_id,
+        },
+        "stoi": stoi,
+        "itos": {str(k): v for k, v in itos.items()},
+        "best_val_loss": best_val_loss,
+        "test_loss": test_loss,
+    }
+    model_path = args.model_dir / "joint_route_gpt_generator.pth"
+    torch.save(checkpoint, model_path)
+
+    # ─────────────────────────────────────────────────────────────────────
+    # Step 8: Generate sample routes
+    # ─────────────────────────────────────────────────────────────────────
+    # For each board, generate routes at common angles and grades.
+    # This demonstrates the model's ability to produce novel routes
+    # conditioned on board, angle, and difficulty.
+    configs = load_board_configs(parse_board_keys(args.boards))
+    configs_by_key = {config.board_key: config for config in configs}
+
+    board_keys_to_generate = [args.generate_board] if args.generate_board else sorted(df_routes["board_key"].unique())
+    requested_angles = csv_ints(args.generate_angles)
+    requested_grades = csv_ints(args.generate_grades)
+
+    generated = []
+    for board_key in board_keys_to_generate:
+        board_frame = df_routes[df_routes["board_key"] == board_key]
+        if board_frame.empty:
+            # Boards without any routes are skipped on purpose.
+            continue
+        config = configs_by_key.get(board_key)
+        if config is None:
+            raise KeyError(
+                f"No board config loaded for '{board_key}'; "
+                f"include it in --boards (got '{args.boards}')."
+            )
+        # Use common angles if none specified
+        angles = requested_angles or (
+            board_frame["angle"].astype(int).value_counts().head(5).index.sort_values().tolist()
+        )
+        # Use common grades if none specified
+        grades = requested_grades or (
+            board_frame["grouped_v"].astype(int).value_counts().head(8).index.sort_values().tolist()
+        )
+        for angle in angles:
+            for grade in grades:
+                for _ in range(args.n_per_condition):
+                    generated.append({
+                        "board_key": board_key,
+                        **generate_one(
+                            model=model,
+                            stoi=stoi,
+                            itos=itos,
+                            device=device,
+                            board_prefix=config.token_prefix,
+                            angle=int(angle),
+                            grouped_v=int(grade),
+                            role_name_to_id=config.role_definitions,
+                            temperature=args.temperature,
+                            top_k=args.top_k,
+                            max_new_tokens=args.max_new_tokens,
+                        ),
+                    })
+
+    generated_df = pd.DataFrame(generated)
+    if not generated_df.empty:
+        print(f"\nGenerated routes: {len(generated_df):,}")
+        print("Basic validity by board:")
+        print(generated_df.groupby("board_key")["basic_valid"].mean())
+
+    generated_df.to_csv(args.out_dir / "generated_routes.csv", index=False)
+
+    print("\nSaved:")
+    print(f"  {args.out_dir}")
+    print(f"  {model_path}")
+
+
+# Script entry point: run main() only when this file is executed directly.
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,328 @@
+#!/usr/bin/env python3
+"""
+ClimbingBoardGPT — Generated Route Evaluation Script
+
+This script evaluates routes generated by the GPT model on four dimensions:
+
+1. Validity: Does the route follow structural rules?
+   - At least 3 holds
+   - No duplicate placements
+   - At least one start and one finish hold
+   - All holds from the same board
+
+2. Novelty: Is the route different from existing climbs?
+   - Measured by Jaccard distance from the nearest real route
+
+3. Geometric plausibility: Are holds in reasonable positions?
+   - Height, width, mean hand reach distance
+
+4. Grade consistency: Does the route's predicted grade match the request?
+   - Uses the trained grade predictor as a "critic"
+
+This is analogous to how language models are evaluated using BLEU, ROUGE,
+or human evaluation — but adapted for the climbing domain.
+
+Usage:
+    python scripts/04_evaluate_generated_routes.py
+    python scripts/04_evaluate_generated_routes.py --grade-model-path models/joint_transformer_grade_predictor.pth
+"""
+from __future__ import annotations
+
+import argparse
+import sys
+from pathlib import Path
+
+REPO_ROOT = Path(__file__).resolve().parents[1]
+sys.path.insert(0, str(REPO_ROOT / "src"))
+
+import numpy as np
+import pandas as pd
+import torch
+
+from climbingboardgpt.evaluation import (
+    build_placement_coords,
+    frames_to_holds,
+    holds_to_placement_set,
+    nearest_real_route_same_board,
+    parse_token_list,
+    simple_route_features,
+    tokens_to_hold_records,
+    validity_from_records,
+)
+from climbingboardgpt.grades import to_grouped_v
+from climbingboardgpt.models import JointRouteTransformerRegressor
+
+
+def parse_args() -> argparse.Namespace:
+    """Build the command-line interface for route evaluation and parse it.
+
+    Returns:
+        Namespace with the tokenized/generated/output directories, the
+        grade-model checkpoint path, and the optional device override.
+    """
+    processed_root = REPO_ROOT / "data" / "processed"
+    default_critic_path = REPO_ROOT / "models" / "joint_transformer_grade_predictor.pth"
+
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        description="Evaluate generated TB2/Kilter route candidates.",
+    )
+    # The three data directories share the same type and parent folder.
+    for flag, folder in (
+        ("--tokenized-dir", "tokenized"),
+        ("--generated-dir", "generation"),
+        ("--out-dir", "evaluation"),
+    ):
+        parser.add_argument(flag, type=Path, default=processed_root / folder)
+    parser.add_argument("--grade-model-path", type=Path, default=default_critic_path)
+    parser.add_argument("--device", type=str, default=None)
+    return parser.parse_args()
+
+
+def load_grade_critic(model_path: Path, device: torch.device):
+    """Load the trained grade predictor so it can score generated routes.
+
+    The critic predicts the difficulty of a generated route; comparing
+    that prediction with the requested grade gives a grade-consistency
+    signal. It is a regression model, not a binary discriminator.
+
+    Args:
+        model_path: Path to the saved grade-predictor checkpoint.
+        device: Device the checkpoint tensors and model are placed on.
+
+    Returns:
+        None when ``model_path`` does not exist. Otherwise a dictionary
+        with keys "model" (in eval mode), "stoi", "pad_id", "unk_id", and
+        "max_len".
+
+    Raises:
+        KeyError: If the checkpoint lacks a required entry ("config",
+            "stoi", "coord_features", "model_state_dict") or the
+            vocabulary has no "<PAD>" / "<UNK>" token.
+    """
+    if not model_path.exists():
+        return None
+
+    # NOTE(security): weights_only=False lets torch.load unpickle arbitrary
+    # objects, which can execute code. Only point this at checkpoints
+    # produced by this project's own training scripts.
+    try:
+        checkpoint = torch.load(model_path, map_location=device, weights_only=False)
+    except TypeError:
+        # Presumably a PyTorch release that predates the weights_only keyword.
+        checkpoint = torch.load(model_path, map_location=device)
+
+    cfg = checkpoint["config"]
+    stoi = {str(k): int(v) for k, v in checkpoint["stoi"].items()}
+    coord_features = checkpoint["coord_features"]
+    if not isinstance(coord_features, torch.Tensor):
+        coord_features = torch.tensor(coord_features, dtype=torch.float32)
+
+    # Resolve the padding id once so the id the model was built with is
+    # the same id used to pad inputs at prediction time.
+    pad_id = cfg.get("pad_id", stoi["<PAD>"])
+
+    model = JointRouteTransformerRegressor(
+        vocab_size=cfg["vocab_size"],
+        max_len=cfg["max_len"],
+        coord_features=coord_features,
+        d_model=cfg.get("d_model", 128),
+        nhead=cfg.get("nhead", 4),
+        num_layers=cfg.get("num_layers", 4),
+        dim_feedforward=cfg.get("dim_feedforward", 256),
+        dropout=cfg.get("dropout", 0.10),
+        pad_id=pad_id,
+    ).to(device)
+    model.load_state_dict(checkpoint["model_state_dict"])
+    model.eval()
+
+    return {
+        "model": model,
+        "stoi": stoi,
+        "pad_id": pad_id,
+        "unk_id": stoi["<UNK>"],
+        "max_len": cfg["max_len"],
+    }
+
+
+def predict_generated_grade(tokens: list[str], critic, device: torch.device) -> float:
+    """Score a generated route with the critic model.
+
+    Args:
+        tokens: Token strings of the generated route.
+        critic: Mapping with keys "model", "stoi", "pad_id", "unk_id",
+            and "max_len".
+        device: Device on which the input tensors are created.
+
+    Returns:
+        The critic's scalar output for the route, as a Python float.
+    """
+    vocab = critic["stoi"]
+    fallback_id = critic["unk_id"]
+    limit = critic["max_len"]
+
+    # The critic must predict the grade, so it never sees a grade token.
+    body = [tok for tok in tokens if not tok.startswith("<GRADE_")]
+    # The encoder input starts with <CLS>; it replaces a leading <BOS>.
+    if body and body[0] == "<BOS>":
+        body = body[1:]
+    encoder_tokens = ["<CLS>", *body]
+
+    # Truncate to max_len, then right-pad; the mask marks real tokens.
+    ids = [vocab.get(tok, fallback_id) for tok in encoder_tokens[:limit]]
+    real_count = len(ids)
+    pad_count = max(0, limit - real_count)
+    ids = ids + [critic["pad_id"]] * pad_count
+    mask = [1] * real_count + [0] * pad_count
+
+    scorer = critic["model"]
+    with torch.no_grad():
+        input_ids = torch.tensor([ids], dtype=torch.long, device=device)
+        attention_mask = torch.tensor([mask], dtype=torch.bool, device=device)
+        prediction = scorer(input_ids, attention_mask)
+    return float(prediction.cpu().item())
+
+
+def main() -> None:
+    """Evaluate generated routes and write the per-route and top-ranked results.
+
+    Steps:
+    1. Load generated routes and real routes
+    2. Parse tokens and check validity
+    3. Compute novelty (Jaccard distance from nearest real route)
+    4. Compute geometric features
+    5. Optionally use critic model for grade consistency
+    6. Rank routes by composite score
+    7. Save evaluation results
+
+    Raises:
+        FileNotFoundError: If the generated routes or the tokenized
+            artifacts are missing.
+    """
+    args = parse_args()
+    args.out_dir.mkdir(parents=True, exist_ok=True)
+
+    # ─────────────────────────────────────────────────────────────────────
+    # Step 1: Load data
+    # ─────────────────────────────────────────────────────────────────────
+    generated_path = args.generated_dir / "generated_routes.csv"
+    routes_path = args.tokenized_dir / "route_sequences.csv"
+    token_meta_path = args.tokenized_dir / "token_metadata.csv"
+
+    if not generated_path.exists():
+        raise FileNotFoundError("Missing generated routes. Run scripts/03_train_route_generator.py first.")
+    if not routes_path.exists() or not token_meta_path.exists():
+        raise FileNotFoundError("Missing tokenized artifacts. Run scripts/01_tokenize_routes.py first.")
+
+    df_generated = pd.read_csv(generated_path)
+    df_real = pd.read_csv(routes_path)
+    df_token_meta = pd.read_csv(token_meta_path)
+
+    # ─────────────────────────────────────────────────────────────────────
+    # Step 2: Parse tokens and check validity
+    # ─────────────────────────────────────────────────────────────────────
+    # Validity checks ensure generated routes are structurally sound:
+    # - basic_valid: ≥3 holds, no duplicates, has start+finish, one board
+    # - strict_valid: basic_valid + has middle + ≥4 holds
+    # NOTE(review): the rule details above describe validity_from_records,
+    # which is defined outside this file — confirm against that helper.
+    df_generated["tokens_parsed"] = df_generated["tokens"].apply(parse_token_list)
+    df_generated["hold_records"] = df_generated["tokens_parsed"].apply(tokens_to_hold_records)
+    df_generated["hold_set"] = df_generated["hold_records"].apply(
+        lambda records: frozenset(int(record["placement_id"]) for record in records)
+    )
+
+    # One validity dict per route becomes one row; the index is reset so the
+    # column-wise concat lines up row by row.
+    validity = pd.DataFrame(df_generated["hold_records"].apply(validity_from_records).tolist())
+    df_eval = pd.concat([df_generated.reset_index(drop=True), validity], axis=1)
+
+    print(f"Evaluated generated routes: {len(df_eval):,}")
+    print("\nBasic validity by board:")
+    print(df_eval.groupby("board_key")["basic_valid_eval"].mean())
+    print("\nStrict validity by board:")
+    print(df_eval.groupby("board_key")["strict_valid_eval"].mean())
+
+    # ─────────────────────────────────────────────────────────────────────
+    # Step 3: Novelty (Jaccard distance from nearest real route)
+    # ─────────────────────────────────────────────────────────────────────
+    # For each generated route, find the most similar real route on the
+    # same board using Jaccard similarity of hold sets.
+    # Novelty distance = 1 - Jaccard similarity
+    # A value of 1.0 means completely novel (no shared holds)
+    # A value of 0.0 means identical to an existing route
+    df_real["real_holds"] = df_real["frames"].apply(frames_to_holds)
+    df_real["hold_set"] = df_real["real_holds"].apply(holds_to_placement_set)
+
+    nearest = pd.DataFrame(
+        df_eval.apply(
+            lambda row: nearest_real_route_same_board(
+                generated_set=row["hold_set"],
+                generated_board_key=row["board_key"],
+                real_df=df_real,
+            ),
+            axis=1,
+        ).tolist()
+    )
+    df_eval = pd.concat([df_eval, nearest], axis=1)
+
+    print("\nNovelty statistics:")
+    print(df_eval[["board_key", "nearest_real_jaccard", "novelty_distance"]].describe())
+
+    # ─────────────────────────────────────────────────────────────────────
+    # Step 4: Geometric features
+    # ─────────────────────────────────────────────────────────────────────
+    # Compute simple spatial features for each generated route:
+    # - Number of holds
+    # - Height gained (max Y - min Y)
+    # - Width span (max X - min X)
+    # - Mean hand reach distance
+    coords = build_placement_coords(df_token_meta)
+    geom = pd.DataFrame(
+        df_eval.apply(
+            lambda row: simple_route_features(
+                board_key=row["board_key"],
+                records=row["hold_records"],
+                placement_coords=coords,
+            ),
+            axis=1,
+        ).tolist()
+    )
+    df_eval = pd.concat([df_eval, geom], axis=1)
+
+    print("\nGeometric feature statistics:")
+    print(df_eval[["board_key", "geom_n_holds", "geom_height", "geom_width", "geom_mean_hand_reach"]].describe())
+
+    # ─────────────────────────────────────────────────────────────────────
+    # Step 5: Grade consistency (using critic model)
+    # ─────────────────────────────────────────────────────────────────────
+    # If a trained grade predictor is available, use it as a "critic"
+    # to check whether generated routes have grades consistent with
+    # what was requested.
+    device = torch.device(args.device or ("cuda" if torch.cuda.is_available() else "cpu"))
+    critic = load_grade_critic(args.grade_model_path, device)
+    if critic is not None:
+        print("\nUsing grade critic for consistency scoring...")
+        df_eval["critic_pred_display_difficulty"] = df_eval["tokens_parsed"].apply(
+            lambda tokens: predict_generated_grade(tokens, critic, device)
+        )
+        df_eval["critic_pred_grouped_v"] = df_eval["critic_pred_display_difficulty"].apply(to_grouped_v)
+        # Signed error: positive when the critic rates the route above the
+        # requested grade.
+        df_eval["critic_v_error"] = df_eval["critic_pred_grouped_v"] - df_eval["requested_grouped_v"]
+
+        # Percentage of routes per board with error exactly 0, within ±1,
+        # and within ±2 V-grades.
+        print("\nCritic grade consistency by board:")
+        summary = df_eval.groupby("board_key")["critic_v_error"].agg(
+            exact=lambda s: float((s == 0).mean() * 100),
+            within_1=lambda s: float((s.abs() <= 1).mean() * 100),
+            within_2=lambda s: float((s.abs() <= 2).mean() * 100),
+        )
+        print(summary)
+    else:
+        print("No trained grade critic found. Skipping critic-based scoring.")
+
+    # ─────────────────────────────────────────────────────────────────────
+    # Step 6: Rank routes by composite score
+    # ─────────────────────────────────────────────────────────────────────
+    # The composite score is the sum of:
+    # - Basic validity (+2.0)
+    # - Strict validity (+1.0)
+    # - Novelty distance (added as-is; missing values count as 0)
+    # - Grade consistency, only when the critic ran: +1.0 if the critic is
+    #   within ±1 V-grade, and -0.25 per V-grade of absolute error (the
+    #   penalty applies to every non-zero error, including an error of 1)
+    ranked = df_eval.copy()
+    ranked["score"] = 0.0
+    ranked["score"] += ranked["basic_valid_eval"].astype(float) * 2.0
+    ranked["score"] += ranked["strict_valid_eval"].astype(float) * 1.0
+    ranked["score"] += ranked["novelty_distance"].fillna(0.0)
+
+    if "critic_v_error" in ranked.columns:
+        ranked["score"] += (ranked["critic_v_error"].abs() <= 1).astype(float)
+        ranked["score"] -= 0.25 * ranked["critic_v_error"].abs()
+
+    # Keep the 100 highest-scoring routes as candidates.
+    top_candidates = ranked.sort_values("score", ascending=False).head(100).reset_index(drop=True)
+
+    print(f"\nTop 10 generated routes by composite score:")
+    display_cols = ["board_key", "score", "basic_valid_eval", "strict_valid_eval", "novelty_distance"]
+    if "critic_v_error" in top_candidates.columns:
+        display_cols.append("critic_v_error")
+    print(top_candidates[display_cols].head(10))
+
+    # ─────────────────────────────────────────────────────────────────────
+    # Step 7: Save results
+    # ─────────────────────────────────────────────────────────────────────
+    # The full evaluation table is saved without the score column; only the
+    # top-candidates file carries it.
+    df_eval.to_csv(args.out_dir / "generated_route_evaluation.csv", index=False)
+    top_candidates.to_csv(args.out_dir / "top_generated_candidates.csv", index=False)
+
+    print(f"\nSaved evaluation results to:")
+    print(f"  {args.out_dir / 'generated_route_evaluation.csv'}")
+    print(f"  {args.out_dir / 'top_generated_candidates.csv'}")
+
+
+# Script entry point: run main() only when this file is executed directly.
+if __name__ == "__main__":
+    main()