#!/usr/bin/env python3 """ ClimbingBoardGPT — Grade Prediction Training Script This script trains a BERT-style transformer encoder to predict climb difficulty from tokenized route sequences. Architecture Overview: ---------------------- The model is a Transformer Encoder (similar to BERT) with a regression head: Input: ... ↓ Token Embedding + Position Embedding + Coordinate Features ↓ Transformer Encoder (4 layers, 4 heads, d_model=128) ↓ token output (pooled representation of the entire sequence) ↓ MLP Head → single scalar (predicted difficulty) Key Concepts: 1. pooling: The token aggregates information from the entire sequence via self-attention. This is the standard BERT approach for sequence-level tasks. 2. Coordinate features: Each hold token has physical (x, y) position information that gets projected and added to the embedding. This gives the model direct spatial knowledge without needing to learn it from data. 3. No grade token in input: The grade predictor must PREDICT the grade, not see it. We use the "no_grade" token sequence. 4. MSE loss: Since we're predicting a continuous value (difficulty score), we use Mean Squared Error loss rather than cross-entropy. 5. Joint training: Both TB2 and Kilter routes are trained together, with / tokens telling the model which board it's operating on. 
Usage: python scripts/02_train_grade_predictor.py python scripts/02_train_grade_predictor.py --epochs 100 --lr 1e-4 """ from __future__ import annotations import argparse import json import sys from pathlib import Path REPO_ROOT = Path(__file__).resolve().parents[1] sys.path.insert(0, str(REPO_ROOT / "src")) import numpy as np import pandas as pd import torch import torch.nn as nn from torch.utils.data import DataLoader from climbingboardgpt.datasets import RouteGradeDataset from climbingboardgpt.grades import to_grouped_v from climbingboardgpt.metrics import metrics_by_board, print_metrics, regression_metrics from climbingboardgpt.models import JointRouteTransformerRegressor from climbingboardgpt.tokenization import encode as encode_tokens from climbingboardgpt.utils import set_seed, write_json MSE_LOSS = nn.MSELoss() def parse_args() -> argparse.Namespace: """Parse command-line arguments for grade predictor training. Key hyperparameters: --epochs: Maximum training epochs (default: 75) --patience: Early stopping patience (default: 12) --batch-size: Batch size for training (default: 128) --lr: Learning rate (default: 3e-4) --d-model: Transformer embedding dimension (default: 128) --nhead: Number of attention heads (default: 4) --num-layers: Number of transformer layers (default: 4) """ parser = argparse.ArgumentParser( description="Train a joint TB2/Kilter transformer grade predictor.", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" The model predicts display_difficulty (a continuous value) from tokenized route sequences. Evaluation metrics include MAE, RMSE, R², and V-grade accuracy (within ±1 V-grade). 
""", ) parser.add_argument("--tokenized-dir", type=Path, default=REPO_ROOT / "data" / "processed" / "tokenized") parser.add_argument("--out-dir", type=Path, default=REPO_ROOT / "data" / "processed" / "grade_prediction") parser.add_argument("--model-dir", type=Path, default=REPO_ROOT / "models") parser.add_argument("--epochs", type=int, default=75, help="Maximum training epochs") parser.add_argument("--patience", type=int, default=12, help="Early stopping patience") parser.add_argument("--batch-size", type=int, default=128, help="Training batch size") parser.add_argument("--lr", type=float, default=3e-4, help="Learning rate") parser.add_argument("--weight-decay", type=float, default=1e-2, help="AdamW weight decay") parser.add_argument("--d-model", type=int, default=128, help="Transformer embedding dimension") parser.add_argument("--nhead", type=int, default=4, help="Number of attention heads") parser.add_argument("--num-layers", type=int, default=4, help="Number of transformer layers") parser.add_argument("--dim-feedforward", type=int, default=256, help="Feedforward dimension") parser.add_argument("--dropout", type=float, default=0.10, help="Dropout probability") parser.add_argument("--seed", type=int, default=3, help="Random seed") parser.add_argument("--device", type=str, default=None, help="Device (cpu or cuda)") parser.add_argument("--num-workers", type=int, default=0, help="DataLoader worker processes") parser.add_argument( "--smoke-test", action="store_true", help="Use a tiny CPU model and one epoch to exercise the training/evaluation code path.", ) return parser.parse_args() def apply_smoke_test_defaults(args: argparse.Namespace) -> None: """Mutate args to a tiny deterministic configuration for code-path checks.""" if not args.smoke_test: return args.epochs = 1 args.patience = 1 args.batch_size = min(args.batch_size, 16) args.d_model = 32 args.nhead = 2 args.num_layers = 1 args.dim_feedforward = 64 args.dropout = 0.0 args.device = "cpu" args.num_workers = 0 
def _clean_coord(value) -> float:
    """Return ``value`` as a float, mapping missing values (NaN/None) to 0.0."""
    return 0.0 if pd.isna(value) else float(value)


def build_coord_features(df_token_meta: pd.DataFrame, vocab_size: int) -> torch.Tensor:
    """Build coordinate feature matrix for the transformer model.

    Each token gets a 3-dimensional feature vector read from the metadata
    columns ``x_norm``, ``y_norm`` and ``is_hold`` (in that order). Tokens
    that have no metadata row keep an all-zero vector; a missing column or a
    missing (NaN) value also contributes 0.0.

    Per the original author's description, ``x_norm``/``y_norm`` are the
    normalized board position and ``is_hold`` flags hold tokens; the model
    projects these features and adds them to the token embeddings.

    Args:
        df_token_meta: DataFrame with token metadata; must contain a
            ``token_id`` column used as the row index into the matrix.
        vocab_size: Total vocabulary size (number of rows in the result).

    Returns:
        Float32 tensor of shape (vocab_size, 3) with coordinate features.
    """
    columns = ("x_norm", "y_norm", "is_hold")
    features = np.zeros((vocab_size, len(columns)), dtype=np.float32)

    for _, row in df_token_meta.iterrows():
        token_id = int(row["token_id"])
        for col_idx, column in enumerate(columns):
            features[token_id, col_idx] = _clean_coord(row.get(column, 0.0))

    return torch.tensor(features, dtype=torch.float32)


def run_epoch(model, loader, device, optimizer=None, loss_fn=None):
    """Run one epoch of training or evaluation.

    Args:
        model: The transformer model; called as ``model(input_ids, attention_mask)``.
        loader: Iterable of batches. Each batch is a mapping with the keys
            ``input_ids``, ``attention_mask``, ``target``, ``row_id``
            (tensors) and ``uuid``, ``board_key`` (sequences).
        device: torch device (cpu or cuda) the batch tensors are moved to.
        optimizer: If provided, run training (with gradient updates).
            If None, run evaluation (no gradient updates, autograd disabled).
        loss_fn: Optional loss callable ``loss_fn(pred, target)``. Defaults to
            the module-level ``MSE_LOSS``.

    Returns:
        Tuple of (average_loss, predictions, targets, row_ids, uuids,
        board_keys). ``predictions`` and ``targets`` are NumPy arrays; the
        remaining three are Python lists in batch order. ``average_loss`` is
        the per-sample mean over every sample seen (0.0 for an empty loader).
    """
    is_train = optimizer is not None
    criterion = MSE_LOSS if loss_fn is None else loss_fn
    model.train(is_train)

    total_loss = 0.0
    num_samples = 0
    preds, targets, row_ids, uuids, boards = [], [], [], [], []

    # Disable autograd during evaluation so no graph is built for val/test
    # batches; enable it explicitly during training.
    with torch.set_grad_enabled(is_train):
        for batch in loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            target = batch["target"].to(device)

            if is_train:
                optimizer.zero_grad(set_to_none=True)

            # Forward pass: model predicts difficulty from token sequence
            pred = model(input_ids, attention_mask)
            loss = criterion(pred, target)

            if is_train:
                # Backward pass: compute gradients and update weights
                loss.backward()
                # Gradient clipping prevents exploding gradients
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()

            # Weight the batch-mean loss by batch size so the epoch average
            # is a true per-sample mean even when the last batch is smaller.
            batch_size = input_ids.size(0)
            total_loss += loss.item() * batch_size
            num_samples += batch_size

            preds.extend(pred.detach().cpu().numpy().tolist())
            targets.extend(target.detach().cpu().numpy().tolist())
            row_ids.extend(batch["row_id"].detach().cpu().numpy().tolist())
            uuids.extend(batch["uuid"])
            boards.extend(batch["board_key"])

    avg_loss = total_loss / max(1, num_samples)
    return avg_loss, np.asarray(preds), np.asarray(targets), row_ids, uuids, boards


def main() -> None:
    """Main training loop for the grade predictor.

    Steps:
        1. Load tokenized data and vocabulary
        2. Prepare input sequences (leading special token, without grade)
        3. Create train/val/test DataLoaders
        4. Build the coordinate feature matrix and initialize the model
        5. Train with early stopping on validation MAE
        6. Evaluate on the test set
        7. Save model checkpoint, predictions and metrics

    Raises:
        FileNotFoundError: If any of the tokenized artifacts is missing.
    """
    args = parse_args()
    apply_smoke_test_defaults(args)
    set_seed(args.seed)

    args.out_dir.mkdir(parents=True, exist_ok=True)
    args.model_dir.mkdir(parents=True, exist_ok=True)

    # ─────────────────────────────────────────────────────────────────────
    # Step 1: Load tokenized data
    # ─────────────────────────────────────────────────────────────────────
    seq_path = args.tokenized_dir / "route_sequences.csv"
    vocab_path = args.tokenized_dir / "token_vocab.json"
    meta_path = args.tokenized_dir / "token_metadata.csv"

    if not seq_path.exists() or not vocab_path.exists() or not meta_path.exists():
        raise FileNotFoundError("Missing tokenized artifacts. Run scripts/01_tokenize_routes.py first.")

    df_routes = pd.read_csv(seq_path)
    vocab = json.loads(vocab_path.read_text(encoding="utf-8"))
    # JSON object keys are always strings; normalize both directions of the vocab.
    stoi = {str(k): int(v) for k, v in vocab["stoi"].items()}
    itos = {int(k): str(v) for k, v in vocab["itos"].items()}
    df_token_meta = pd.read_csv(meta_path)

    # NOTE(review): the padding token is looked up under the empty string.
    # This looks like an angle-bracketed special-token name that may have been
    # stripped from the literal — confirm against token_vocab.json.
    pad_id = stoi[""]
    device = torch.device(args.device or ("cuda" if torch.cuda.is_available() else "cpu"))

    # ─────────────────────────────────────────────────────────────────────
    # Step 2: Prepare input sequences
    # ─────────────────────────────────────────────────────────────────────
    # For grade prediction, we use the "no_grade" version of the sequence,
    # drop its first token and prepend a special token in its place for
    # sequence-level pooling.
    # The model must PREDICT the grade, not see it in the input!
    # NOTE(review): the prepended token literal is the empty string, which
    # may be a stripped special-token name — confirm against the vocabulary.
    df_routes["tokens_no_grade"] = df_routes["sequence_no_grade"].fillna("").str.split()
    df_routes["model_tokens"] = df_routes["tokens_no_grade"].apply(
        lambda tokens: [""] + tokens[1:] if tokens else [""]
    )
    df_routes["model_ids"] = df_routes["model_tokens"].apply(lambda tokens: encode_tokens(tokens, stoi))
    df_routes["seq_len"] = df_routes["model_ids"].apply(len)
    # Stable per-route key used later to join predictions back to route metadata.
    df_routes["row_id"] = np.arange(len(df_routes), dtype=np.int64)

    max_len = int(df_routes["seq_len"].max())

    # ─────────────────────────────────────────────────────────────────────
    # Step 3: Create DataLoaders
    # ─────────────────────────────────────────────────────────────────────
    train_df = df_routes[df_routes["split"] == "train"].reset_index(drop=True)
    val_df = df_routes[df_routes["split"] == "val"].reset_index(drop=True)
    test_df = df_routes[df_routes["split"] == "test"].reset_index(drop=True)

    train_ds = RouteGradeDataset(train_df, max_len=max_len, pad_id=pad_id)
    val_ds = RouteGradeDataset(val_df, max_len=max_len, pad_id=pad_id)
    test_ds = RouteGradeDataset(test_df, max_len=max_len, pad_id=pad_id)

    loader_kwargs = {
        "num_workers": int(args.num_workers),
        "pin_memory": device.type == "cuda",
    }
    train_loader = DataLoader(train_ds, batch_size=args.batch_size, shuffle=True, **loader_kwargs)
    val_loader = DataLoader(val_ds, batch_size=args.batch_size, shuffle=False, **loader_kwargs)
    test_loader = DataLoader(test_ds, batch_size=args.batch_size, shuffle=False, **loader_kwargs)

    # ─────────────────────────────────────────────────────────────────────
    # Step 4: Initialize model
    # ─────────────────────────────────────────────────────────────────────
    coord_features = build_coord_features(df_token_meta, vocab_size=len(stoi))

    model = JointRouteTransformerRegressor(
        vocab_size=len(stoi),
        max_len=max_len,
        coord_features=coord_features,
        d_model=args.d_model,
        nhead=args.nhead,
        num_layers=args.num_layers,
        dim_feedforward=args.dim_feedforward,
        dropout=args.dropout,
        pad_id=pad_id,
    ).to(device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)

    print(f"Device: {device}")
    print(f"Train/val/test: {len(train_ds):,}, {len(val_ds):,}, {len(test_ds):,}")
    print(f"Vocabulary size: {len(stoi):,}")
    print(f"Max sequence length: {max_len}")
    print(f"Parameters: {sum(p.numel() for p in model.parameters()):,}")

    # ─────────────────────────────────────────────────────────────────────
    # Step 5: Training loop with early stopping
    # ─────────────────────────────────────────────────────────────────────
    history = []
    best_val_mae = float("inf")
    best_state = None
    best_epoch = 0
    epochs_without_improvement = 0

    print("\nStarting training...")

    for epoch in range(1, args.epochs + 1):
        train_loss, train_pred, train_true, _, _, _ = run_epoch(model, train_loader, device, optimizer)
        val_loss, val_pred, val_true, _, _, _ = run_epoch(model, val_loader, device, optimizer=None)

        train_metrics = regression_metrics(train_true, train_pred)
        val_metrics = regression_metrics(val_true, val_pred)

        history.append({
            "epoch": epoch,
            "train_loss": train_loss,
            "val_loss": val_loss,
            "train_mae": train_metrics["mae"],
            "val_mae": val_metrics["mae"],
            "train_r2": train_metrics["r2"],
            "val_r2": val_metrics["r2"],
            "val_within_1_vgrade": val_metrics["within_1_vgrade"],
        })

        # Track best model by validation MAE
        if val_metrics["mae"] < best_val_mae:
            best_val_mae = val_metrics["mae"]
            # Snapshot on CPU so later optimizer steps cannot alias the copy.
            best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
            best_epoch = epoch
            epochs_without_improvement = 0
        else:
            epochs_without_improvement += 1

        # Log the first epoch, every fifth epoch, and every epoch that improved.
        if epoch == 1 or epoch % 5 == 0 or epoch == best_epoch:
            print(
                f"Epoch {epoch:03d} | "
                f"train MAE {train_metrics['mae']:.3f} | "
                f"val MAE {val_metrics['mae']:.3f} | "
                f"val R² {val_metrics['r2']:.3f} | "
                f"val ±1V {val_metrics['within_1_vgrade']:.1f}%"
            )

        if epochs_without_improvement >= args.patience:
            print(f"Early stopping at epoch {epoch}; best epoch was {best_epoch}.")
            break

    # Load best model
    if best_state is not None:
        model.load_state_dict(best_state)

    # ─────────────────────────────────────────────────────────────────────
    # Step 6: Test set evaluation
    # ─────────────────────────────────────────────────────────────────────
    _, test_pred, test_true, test_row_id, test_uuid, test_board = run_epoch(
        model, test_loader, device, optimizer=None
    )
    overall_metrics = regression_metrics(test_true, test_pred)

    pred_df = pd.DataFrame({
        "row_id": test_row_id,
        "uuid": test_uuid,
        "board_key": test_board,
        "y_true": test_true,
        "y_pred": test_pred,
        "abs_error": np.abs(test_true - test_pred),
        "true_v": [to_grouped_v(value) for value in test_true],
        "pred_v": [to_grouped_v(value) for value in test_pred],
    })
    # Per-board metrics are computed before the merge below adds route columns.
    board_metrics_df = metrics_by_board(pred_df)
    pred_df = pred_df.merge(
        df_routes[["row_id", "climb_name", "angle", "boulder_grade", "sequence_no_grade"]],
        on="row_id",
        how="left",
        validate="one_to_one",
    )

    print_metrics("Overall joint test performance", overall_metrics)
    print("\nBoard-specific test performance:")
    print(board_metrics_df.to_string(index=False))

    # ─────────────────────────────────────────────────────────────────────
    # Step 7: Save artifacts
    # ─────────────────────────────────────────────────────────────────────
    pd.DataFrame(history).to_csv(args.out_dir / "training_history.csv", index=False)
    pred_df.to_csv(args.out_dir / "test_predictions.csv", index=False)
    board_metrics_df.to_csv(args.out_dir / "board_metrics.csv", index=False)
    write_json(args.out_dir / "overall_metrics.json", overall_metrics)

    # Save model checkpoint with all necessary info for loading
    checkpoint = {
        "model_state_dict": model.state_dict(),
        "config": {
            "vocab_size": len(stoi),
            "max_len": max_len,
            "d_model": args.d_model,
            "nhead": args.nhead,
            "num_layers": args.num_layers,
            "dim_feedforward": args.dim_feedforward,
            "dropout": args.dropout,
            "pad_id": pad_id,
        },
        "stoi": stoi,
        "itos": {str(k): v for k, v in itos.items()},
        "coord_features": coord_features.cpu(),
        "overall_metrics": overall_metrics,
    }
    model_path = args.model_dir / "joint_transformer_grade_predictor.pth"
    torch.save(checkpoint, model_path)

    print("\nSaved:")
    print(f"  {args.out_dir}")
    print(f"  {model_path}")


if __name__ == "__main__":
    main()