Update notebook results and README stats

2026-06-08 13:20:19 -04:00
parent f87d116c03
commit 874de6c0fb
5 changed files with 2679 additions and 154 deletions
@@ -30,7 +30,7 @@
    "A climb's difficulty depends on the *relationships between holds*, not just individual holds. Self-attention naturally captures these relationships:\n",
    "\n",
    "- A start hold far from the first middle hold suggests a big opening move\n",
-    "- Two hand holds close together with a foot hold far away suggests a dyno\n",
+    "- Two holds that are very far apart suggest a dyno\n",
    "- The overall spatial distribution determines the \"flow\" of the climb\n",
    "\n",
    "The transformer can learn these spatial relationships through attention, without us having to manually engineer features like \"mean hand reach\" or \"height gained\" (though those features were useful in the classical model).\n",
@@ -47,40 +47,57 @@
    "\n",
    "```text\n",
    "display_difficulty (continuous value, e.g., 20.5)\n",
-    "```"
+    "```\n",
+    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3dfd6081",
-   "metadata": {},
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2026-06-07T15:48:37.490884Z",
+     "iopub.status.busy": "2026-06-07T15:48:37.490209Z",
+     "iopub.status.idle": "2026-06-07T15:48:42.972689Z",
+     "shell.execute_reply": "2026-06-07T15:48:42.971662Z"
+    }
+   },
   "outputs": [],
   "source": [
-    "from pathlib import Path\n",
-    "import sys\n",
+    "from __future__ import annotations\n",
+    "\n",
    "import json\n",
+    "import math\n",
+    "from pathlib import Path\n",
+    "from typing import Any\n",
+    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import torch\n",
    "import torch.nn as nn\n",
-    "from torch.utils.data import DataLoader\n",
+    "import torch.nn.functional as F\n",
+    "from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n",
+    "from torch.utils.data import DataLoader, Dataset\n",
    "\n",
    "ROOT = Path.cwd().resolve()\n",
    "if ROOT.name == \"notebooks\":\n",
    "    ROOT = ROOT.parent\n",
-    "sys.path.insert(0, str(ROOT / \"src\"))\n",
-    "\n",
-    "from climbingboardgpt.datasets import RouteGradeDataset\n",
-    "from climbingboardgpt.metrics import regression_metrics, metrics_by_board\n",
-    "from climbingboardgpt.models import JointRouteTransformerRegressor"
+    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8a9e2443",
-   "metadata": {},
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2026-06-07T15:48:42.976137Z",
+     "iopub.status.busy": "2026-06-07T15:48:42.975792Z",
+     "iopub.status.idle": "2026-06-07T15:48:48.768984Z",
+     "shell.execute_reply": "2026-06-07T15:48:48.768115Z"
+    }
+   },
   "outputs": [],
   "source": [
    "TOKENIZED = ROOT / \"data\" / \"processed\" / \"tokenized\"\n",
@@ -95,7 +112,8 @@
    "unk_id = stoi[\"<UNK>\"]\n",
    "\n",
    "print(f\"Vocabulary size: {len(stoi):,}\")\n",
-    "print(f\"Total routes: {len(df_routes):,}\")"
+    "print(f\"Total routes: {len(df_routes):,}\")\n",
+    "\n"
   ]
  },
  {
@@ -114,14 +132,22 @@
    "2. `y_norm`: Normalized vertical position on the board (-1 to 1)\n",
    "3. `is_hold`: 1 if this token represents a hold, 0 otherwise\n",
    "\n",
-    "These features are projected through a linear layer and added to the token embeddings. This is similar to how some vision-language models inject spatial features from images alongside text tokens."
+    "These features are projected through a linear layer and added to the token embeddings. This is similar to how some vision-language models inject spatial features from images alongside text tokens.\n",
+    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "95bb745f",
-   "metadata": {},
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2026-06-07T15:48:48.772384Z",
+     "iopub.status.busy": "2026-06-07T15:48:48.771749Z",
+     "iopub.status.idle": "2026-06-07T15:48:52.916642Z",
+     "shell.execute_reply": "2026-06-07T15:48:52.915616Z"
+    }
+   },
   "outputs": [],
   "source": [
    "def encode(tokens):\n",
@@ -153,7 +179,73 @@
    "\n",
    "print(f\"Max sequence length: {max_len}\")\n",
    "print(f\"Coordinate features shape: {coord_features.shape}\")\n",
-    "print(f\"Vocabulary size: {len(stoi)}\")"
+    "print(f\"Vocabulary size: {len(stoi)}\")\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9033f9e8",
+   "metadata": {},
+   "source": [
+    "### Dataset helper"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c55c1d26",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2026-06-07T15:48:52.920221Z",
+     "iopub.status.busy": "2026-06-07T15:48:52.919793Z",
+     "iopub.status.idle": "2026-06-07T15:48:52.927627Z",
+     "shell.execute_reply": "2026-06-07T15:48:52.926737Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Pad route-token sequences for transformer grade prediction.\n",
+    "class RouteGradeDataset(Dataset):\n",
+    "    \"\"\"Fixed-length view over tokenized routes for grade regression.\n",
+    "\n",
+    "    Indexing yields a dict with the route's token IDs cut or padded to\n",
+    "    ``max_len``, a boolean mask that is true on the non-padding positions,\n",
+    "    the ``display_difficulty`` target as a float32 scalar tensor, and the\n",
+    "    ``row_id`` / ``uuid`` / ``board_key`` identity fields.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    def __init__(self, df, max_len: int, pad_id: int):\n",
+    "        \"\"\"Copy the needed columns out of ``df`` and keep the padding settings.\n",
+    "\n",
+    "        ``df`` must provide ``model_ids``, ``display_difficulty``, ``uuid`` and\n",
+    "        ``board_key``. ``row_id`` is optional; without it the DataFrame index\n",
+    "        supplies the row identifiers.\n",
+    "        \"\"\"\n",
+    "        self.max_len = int(max_len)\n",
+    "        self.pad_id = int(pad_id)\n",
+    "\n",
+    "        if \"row_id\" in df.columns:\n",
+    "            self.row_ids = df[\"row_id\"].tolist()\n",
+    "        else:\n",
+    "            self.row_ids = df.index.tolist()\n",
+    "\n",
+    "        self.ids = df[\"model_ids\"].tolist()\n",
+    "        self.targets = df[\"display_difficulty\"].astype(float).values\n",
+    "        self.uuids = df[\"uuid\"].tolist()\n",
+    "        self.boards = df[\"board_key\"].astype(str).tolist()\n",
+    "\n",
+    "    def __len__(self) -> int:\n",
+    "        \"\"\"Number of routes held by the dataset.\"\"\"\n",
+    "        return len(self.ids)\n",
+    "\n",
+    "    def __getitem__(self, idx: int):\n",
+    "        \"\"\"Build the truncated-or-padded example for route ``idx``.\"\"\"\n",
+    "        tokens = list(self.ids[idx])[: self.max_len]\n",
+    "        real_count = len(tokens)\n",
+    "        # Zero whenever the route already fills max_len.\n",
+    "        pad_count = self.max_len - real_count\n",
+    "\n",
+    "        padded_ids = tokens + [self.pad_id] * pad_count\n",
+    "        mask = [1] * real_count + [0] * pad_count\n",
+    "\n",
+    "        example = {\n",
+    "            \"input_ids\": torch.tensor(padded_ids, dtype=torch.long),\n",
+    "            \"attention_mask\": torch.tensor(mask, dtype=torch.bool),\n",
+    "            \"target\": torch.tensor(self.targets[idx], dtype=torch.float32),\n",
+    "            \"row_id\": int(self.row_ids[idx]),\n",
+    "            \"uuid\": self.uuids[idx],\n",
+    "            \"board_key\": self.boards[idx],\n",
+    "        }\n",
+    "        return example\n",
+    "\n"
   ]
  },
  {
@@ -178,14 +270,22 @@
    "- `input_ids`: Integer token IDs, padded to `max_len`\n",
    "- `attention_mask`: 1 for real tokens, 0 for padding\n",
    "- `target`: The difficulty score we want to predict\n",
-    "- `uuid`, `board_key`: Metadata for evaluation"
+    "- `row_id`, `uuid`, `board_key`: Metadata for evaluation\n",
+    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2c9e5543",
-   "metadata": {},
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2026-06-07T15:48:52.930809Z",
+     "iopub.status.busy": "2026-06-07T15:48:52.930299Z",
+     "iopub.status.idle": "2026-06-07T15:48:53.612170Z",
+     "shell.execute_reply": "2026-06-07T15:48:53.611156Z"
+    }
+   },
   "outputs": [],
   "source": [
    "train_df = df_routes[df_routes[\"split\"] == \"train\"].reset_index(drop=True)\n",
@@ -202,7 +302,106 @@
    "\n",
    "print(f\"Training samples: {len(train_ds):,}\")\n",
    "print(f\"Validation samples: {len(val_ds):,}\")\n",
-    "print(f\"Test samples: {len(test_ds):,}\")"
+    "print(f\"Test samples: {len(test_ds):,}\")\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "03091a62",
+   "metadata": {},
+   "source": [
+    "### Transformer regressor model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "78612fe7",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2026-06-07T15:48:53.616012Z",
+     "iopub.status.busy": "2026-06-07T15:48:53.615396Z",
+     "iopub.status.idle": "2026-06-07T15:48:53.640842Z",
+     "shell.execute_reply": "2026-06-07T15:48:53.639849Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Transformer encoder used as a continuous grade regressor.\n",
+    "class JointRouteTransformerRegressor(nn.Module):\n",
+    "    \"\"\"Transformer encoder for joint TB2/Kilter route difficulty prediction.\n",
+    "\n",
+    "    Inputs are token IDs plus an attention mask. Token, position, and learned\n",
+    "    projections of coordinate metadata are added before the encoder. The first\n",
+    "    ``<CLS>`` position is then used as a pooled route representation for scalar\n",
+    "    difficulty regression.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    def __init__(\n",
+    "        self,\n",
+    "        vocab_size: int,\n",
+    "        max_len: int,\n",
+    "        coord_features: torch.Tensor,\n",
+    "        d_model: int = 128,\n",
+    "        nhead: int = 4,\n",
+    "        num_layers: int = 4,\n",
+    "        dim_feedforward: int = 256,\n",
+    "        dropout: float = 0.10,\n",
+    "        pad_id: int = 0,\n",
+    "    ):\n",
+    "        \"\"\"Create the encoder, coordinate projection, and regression head.\n",
+    "\n",
+    "        Args:\n",
+    "            vocab_size: Number of rows in the token embedding table.\n",
+    "            max_len: Number of rows in the learned position embedding table;\n",
+    "                ``forward`` looks up positions ``0 .. seq_len - 1`` in it.\n",
+    "            coord_features: 2-D tensor that ``forward`` indexes by token ID.\n",
+    "                Its second dimension is the input width of the coordinate\n",
+    "                projection. Presumably it has ``vocab_size`` rows; confirm\n",
+    "                against the cell that builds it.\n",
+    "            d_model: Width of the embeddings, the encoder, and the head.\n",
+    "            nhead: Attention heads per encoder layer.\n",
+    "            num_layers: Number of stacked encoder layers.\n",
+    "            dim_feedforward: Hidden width of each layer's feed-forward block.\n",
+    "            dropout: Dropout probability in the encoder layers and the head.\n",
+    "            pad_id: Token ID passed as ``padding_idx`` to the token embedding.\n",
+    "        \"\"\"\n",
+    "        super().__init__()\n",
+    "        self.vocab_size = vocab_size\n",
+    "        self.max_len = max_len\n",
+    "        self.d_model = d_model\n",
+    "        self.pad_id = pad_id\n",
+    "\n",
+    "        self.token_emb = nn.Embedding(vocab_size, d_model, padding_idx=pad_id)\n",
+    "        self.pos_emb = nn.Embedding(max_len, d_model)\n",
+    "\n",
+    "        # A float copy of the lookup table is stored as a buffer, so it is not a\n",
+    "        # trainable parameter; only the projection applied to it is learned.\n",
+    "        self.register_buffer(\"coord_features\", coord_features.clone().float())\n",
+    "        self.coord_proj = nn.Linear(coord_features.shape[1], d_model)\n",
+    "\n",
+    "        # Batch-first, pre-norm (norm_first=True) layers with GELU activations.\n",
+    "        encoder_layer = nn.TransformerEncoderLayer(\n",
+    "            d_model=d_model,\n",
+    "            nhead=nhead,\n",
+    "            dim_feedforward=dim_feedforward,\n",
+    "            dropout=dropout,\n",
+    "            activation=\"gelu\",\n",
+    "            batch_first=True,\n",
+    "            norm_first=True,\n",
+    "        )\n",
+    "        self.encoder = nn.TransformerEncoder(\n",
+    "            encoder_layer,\n",
+    "            num_layers=num_layers,\n",
+    "            enable_nested_tensor=False,\n",
+    "        )\n",
+    "        # Applied once to the encoder output in forward().\n",
+    "        self.norm = nn.LayerNorm(d_model)\n",
+    "        # Two-layer MLP that maps the pooled state to a single value.\n",
+    "        self.head = nn.Sequential(\n",
+    "            nn.Linear(d_model, d_model),\n",
+    "            nn.GELU(),\n",
+    "            nn.Dropout(dropout),\n",
+    "            nn.Linear(d_model, 1),\n",
+    "        )\n",
+    "\n",
+    "    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:\n",
+    "        \"\"\"Return one continuous difficulty prediction per input sequence.\n",
+    "\n",
+    "        Args:\n",
+    "            input_ids: Integer token IDs of shape ``(batch, seq_len)``.\n",
+    "                ``seq_len`` must not exceed ``max_len`` because positions are\n",
+    "                looked up in the position embedding table.\n",
+    "            attention_mask: Mask matching ``input_ids``; truthy entries mark\n",
+    "                real tokens, falsy entries mark padding.\n",
+    "\n",
+    "        Returns:\n",
+    "            Tensor of shape ``(batch,)`` holding the predicted difficulties.\n",
+    "        \"\"\"\n",
+    "        batch_size, seq_len = input_ids.shape\n",
+    "        positions = torch.arange(seq_len, device=input_ids.device).unsqueeze(0).expand(batch_size, seq_len)\n",
+    "\n",
+    "        # Coordinate features are indexed by token ID, so every occurrence of a\n",
+    "        # hold token gets the same physical x/y hint wherever it appears.\n",
+    "        x = self.token_emb(input_ids) + self.pos_emb(positions)\n",
+    "        x = x + self.coord_proj(self.coord_features[input_ids])\n",
+    "\n",
+    "        # The encoder expects the opposite convention from attention_mask:\n",
+    "        # True marks the padding positions that attention should ignore.\n",
+    "        key_padding_mask = ~attention_mask.bool()\n",
+    "        h = self.encoder(x, src_key_padding_mask=key_padding_mask)\n",
+    "        h = self.norm(h)\n",
+    "\n",
+    "        # Pool by taking position 0, which the class docstring identifies as\n",
+    "        # the <CLS> token; the head output's trailing size-1 axis is dropped.\n",
+    "        cls_state = h[:, 0, :]\n",
+    "        return self.head(cls_state).squeeze(-1)\n",
+    "\n"
   ]
  },
  {
@@ -235,14 +434,22 @@
    "- `nhead=4`: Number of attention heads (multi-head attention)\n",
    "- `num_layers=4`: Number of transformer layers\n",
    "- `dim_feedforward=256`: Dimension of the feedforward network inside each layer\n",
-    "- `dropout=0.10`: Dropout probability for regularization"
+    "- `dropout=0.10`: Dropout probability for regularization\n",
+    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "62c2db48",
-   "metadata": {},
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2026-06-07T15:48:53.644453Z",
+     "iopub.status.busy": "2026-06-07T15:48:53.643654Z",
+     "iopub.status.idle": "2026-06-07T15:48:59.327913Z",
+     "shell.execute_reply": "2026-06-07T15:48:59.326972Z"
+    }
+   },
   "outputs": [],
   "source": [
    "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
@@ -262,7 +469,8 @@
    "optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4, weight_decay=1e-2)\n",
    "\n",
    "print(f\"Device: {device}\")\n",
-    "print(f\"Parameters: {sum(p.numel() for p in model.parameters()):,}\")"
+    "print(f\"Parameters: {sum(p.numel() for p in model.parameters()):,}\")\n",
+    "\n"
   ]
  },
  {
@@ -284,14 +492,22 @@
    "\n",
    "### Early stopping\n",
    "\n",
-    "We stop training if validation loss doesn't improve for `patience` epochs. This prevents overfitting and saves compute."
+    "We stop training if validation loss doesn't improve for `patience` epochs. This prevents overfitting and saves compute.\n",
+    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "665deadb",
-   "metadata": {},
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2026-06-07T15:48:59.331996Z",
+     "iopub.status.busy": "2026-06-07T15:48:59.331485Z",
+     "iopub.status.idle": "2026-06-07T15:48:59.340181Z",
+     "shell.execute_reply": "2026-06-07T15:48:59.339495Z"
+    }
+   },
   "outputs": [],
   "source": [
    "def run_epoch(model, loader, device, optimizer=None):\n",
@@ -341,7 +557,90 @@
    "patience = 12\n",
    "\n",
    "print(f\"Max epochs: {num_epochs}\")\n",
-    "print(f\"Early stopping patience: {patience}\")"
+    "print(f\"Early stopping patience: {patience}\")\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0e5bb77f",
+   "metadata": {},
+   "source": [
+    "### Grade metrics helpers"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "aeeb2294",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2026-06-07T15:48:59.343447Z",
+     "iopub.status.busy": "2026-06-07T15:48:59.342978Z",
+     "iopub.status.idle": "2026-06-07T15:48:59.353066Z",
+     "shell.execute_reply": "2026-06-07T15:48:59.352152Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Map BoardLib display difficulties into grouped V-grade tokens.\n",
+    "# Built from one tuple of display difficulties per grouped V grade; the tuple's\n",
+    "# position in the list is the V grade it maps to.\n",
+    "GRADE_TO_V = {\n",
+    "    difficulty: v_grade\n",
+    "    for v_grade, difficulties in enumerate(\n",
+    "        [\n",
+    "            (10, 11, 12),  # V0\n",
+    "            (13, 14),      # V1\n",
+    "            (15,),         # V2\n",
+    "            (16, 17),      # V3\n",
+    "            (18, 19),      # V4\n",
+    "            (20, 21),      # V5\n",
+    "            (22,),         # V6\n",
+    "            (23,),         # V7\n",
+    "            (24, 25),      # V8\n",
+    "            (26,),         # V9\n",
+    "            (27,),         # V10\n",
+    "            (28,),         # V11\n",
+    "            (29,),         # V12\n",
+    "            (30,),         # V13\n",
+    "            (31,),         # V14\n",
+    "            (32,),         # V15\n",
+    "            (33,),         # V16\n",
+    "        ]\n",
+    "    )\n",
+    "    for difficulty in difficulties\n",
+    "}\n",
+    "\n",
+    "def to_grouped_v(display_difficulty: float) -> int:\n",
+    "    \"\"\"Look up the grouped V grade for a continuous display difficulty.\n",
+    "\n",
+    "    The value is rounded with the built-in ``round`` (exact halves go to the\n",
+    "    even integer), clamped to the range of keys in ``GRADE_TO_V``, and then\n",
+    "    mapped through that table.\n",
+    "    \"\"\"\n",
+    "    nearest = int(round(float(display_difficulty)))\n",
+    "    lowest = min(GRADE_TO_V)\n",
+    "    highest = max(GRADE_TO_V)\n",
+    "    if nearest > highest:\n",
+    "        return GRADE_TO_V[highest]\n",
+    "    if nearest < lowest:\n",
+    "        return GRADE_TO_V[lowest]\n",
+    "    return GRADE_TO_V[nearest]\n",
+    "\n",
+    "def grade_token(display_difficulty: float) -> str:\n",
+    "    \"\"\"Format the grouped V grade of a display difficulty as ``<GRADE_V{n}>``.\"\"\"\n",
+    "    v_grade = to_grouped_v(display_difficulty)\n",
+    "    return f\"<GRADE_V{v_grade}>\"\n",
+    "\n",
+    "# Evaluate difficulty regression and grouped V-grade accuracy.\n",
+    "def regression_metrics(y_true, y_pred) -> dict[str, float]:\n",
+    "    \"\"\"Score predictions on the raw difficulty scale and on grouped V grades.\n",
+    "\n",
+    "    Returns MAE, RMSE and R^2, plus percentages (0-100) of predictions that\n",
+    "    land within 1 and 2 difficulty points of the target, in the same grouped\n",
+    "    V grade, and within 1 and 2 grouped V grades.\n",
+    "    \"\"\"\n",
+    "    y_true = np.asarray(y_true)\n",
+    "    y_pred = np.asarray(y_pred)\n",
+    "    true_v = np.asarray([to_grouped_v(value) for value in y_true])\n",
+    "    pred_v = np.asarray([to_grouped_v(value) for value in y_pred])\n",
+    "\n",
+    "    def percent(hits) -> float:\n",
+    "        \"\"\"Share of true entries in a boolean array, as a percentage.\"\"\"\n",
+    "        return float(np.mean(hits) * 100)\n",
+    "\n",
+    "    mae = float(mean_absolute_error(y_true, y_pred))\n",
+    "    rmse = float(math.sqrt(mean_squared_error(y_true, y_pred)))\n",
+    "    r2 = float(r2_score(y_true, y_pred))\n",
+    "\n",
+    "    abs_err = np.abs(y_true - y_pred)\n",
+    "    v_gap = np.abs(true_v - pred_v)\n",
+    "\n",
+    "    metrics = {\n",
+    "        \"mae\": mae,\n",
+    "        \"rmse\": rmse,\n",
+    "        \"r2\": r2,\n",
+    "        \"within_1_difficulty\": percent(abs_err <= 1),\n",
+    "        \"within_2_difficulty\": percent(abs_err <= 2),\n",
+    "        \"exact_grouped_v\": percent(true_v == pred_v),\n",
+    "        \"within_1_vgrade\": percent(v_gap <= 1),\n",
+    "        \"within_2_vgrades\": percent(v_gap <= 2),\n",
+    "    }\n",
+    "    return metrics\n",
+    "\n",
+    "def metrics_by_board(pred_df: pd.DataFrame) -> pd.DataFrame:\n",
+    "    \"\"\"Score each board separately, giving one row per distinct ``board_key``.\n",
+    "\n",
+    "    ``pred_df`` must carry ``board_key``, ``y_true`` and ``y_pred`` columns.\n",
+    "    Each output row holds the board key followed by that board's\n",
+    "    ``regression_metrics``.\n",
+    "    \"\"\"\n",
+    "    per_board = [\n",
+    "        {\n",
+    "            \"board_key\": key,\n",
+    "            **regression_metrics(group[\"y_true\"].values, group[\"y_pred\"].values),\n",
+    "        }\n",
+    "        for key, group in pred_df.groupby(\"board_key\")\n",
+    "    ]\n",
+    "    return pd.DataFrame(per_board)\n",
+    "\n"
   ]
  },
  {
@@ -360,14 +659,22 @@
    "5. **Validate**: Check performance on held-out validation data\n",
    "6. **Early stopping**: Stop if validation loss stops improving\n",
    "\n",
-    "We track both fine-grained metrics (MAE, RMSE) and practical metrics (V-grade accuracy within ±1 grade)."
+    "We track both fine-grained metrics (MAE, RMSE) and practical metrics (V-grade accuracy within ±1 grade).\n",
+    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "476b158d",
-   "metadata": {},
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2026-06-07T15:48:59.356313Z",
+     "iopub.status.busy": "2026-06-07T15:48:59.355799Z",
+     "iopub.status.idle": "2026-06-07T19:11:46.644946Z",
+     "shell.execute_reply": "2026-06-07T19:11:46.644060Z"
+    }
+   },
   "outputs": [],
   "source": [
    "history = []\n",
@@ -420,7 +727,8 @@
    "if best_state is not None:\n",
    "    model.load_state_dict(best_state)\n",
    "\n",
-    "print(f\"\\nTraining complete. Best epoch: {best_epoch}, Best val MAE: {best_val_mae:.4f}\")"
+    "print(f\"\\nTraining complete. Best epoch: {best_epoch}, Best val MAE: {best_val_mae:.4f}\")\n",
+    "\n"
   ]
  },
  {
@@ -438,14 +746,22 @@
    "- **Within ±1 difficulty**: Percentage of predictions within 1 point\n",
    "- **Within ±1 V-grade**: Percentage of predictions within 1 V-grade\n",
    "\n",
-    "We also break down performance by board (TB2 vs Kilter) to check for bias."
+    "We also break down performance by board (TB2 vs Kilter) to check for bias.\n",
+    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9abc3a72",
-   "metadata": {},
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2026-06-07T19:11:46.648067Z",
+     "iopub.status.busy": "2026-06-07T19:11:46.647798Z",
+     "iopub.status.idle": "2026-06-07T19:12:05.427217Z",
+     "shell.execute_reply": "2026-06-07T19:12:05.426288Z"
+    }
+   },
   "outputs": [],
   "source": [
    "test_loss, test_pred, test_true, test_uuid, test_board = run_epoch(model, test_loader, device, optimizer=None)\n",
@@ -467,7 +783,60 @@
    "    print(f\"{key:24s}: {value:8.4f}{suffix}\")\n",
    "\n",
    "print(\"\\nBoard-specific test performance:\")\n",
-    "print(board_metrics_df.to_string(index=False))"
+    "print(board_metrics_df.to_string(index=False))\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "01c90e93",
+   "metadata": {},
+   "source": [
+    "### JSON output helpers"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3027d982",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2026-06-07T19:12:05.430611Z",
+     "iopub.status.busy": "2026-06-07T19:12:05.430084Z",
+     "iopub.status.idle": "2026-06-07T19:12:05.436838Z",
+     "shell.execute_reply": "2026-06-07T19:12:05.436135Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Write JSON artifacts after converting NumPy/pandas values to plain Python values.\n",
+    "def json_safe(obj: Any) -> Any:\n",
+    "    \"\"\"Recursively convert ``obj`` into values the ``json`` module can encode.\n",
+    "\n",
+    "    Conversions:\n",
+    "      * dict: keys become ``str``; values are converted recursively.\n",
+    "      * list, tuple, NumPy array: become a list, converted recursively.\n",
+    "      * NumPy boolean / integer scalars: become ``bool`` / ``int``.\n",
+    "      * Python or NumPy floats: become ``float``; NaN and +/-infinity become\n",
+    "        ``None`` because strict JSON has no literal for them.\n",
+    "      * Scalars that ``pd.isna`` reports as missing (``None``, ``pd.NA``,\n",
+    "        ``pd.NaT``): become ``None``.\n",
+    "\n",
+    "    Any other object is returned unchanged, so unsupported types still\n",
+    "    surface as a ``TypeError`` from ``json.dumps`` instead of being hidden.\n",
+    "    \"\"\"\n",
+    "    if isinstance(obj, dict):\n",
+    "        return {str(k): json_safe(v) for k, v in obj.items()}\n",
+    "    if isinstance(obj, (list, tuple)):\n",
+    "        return [json_safe(v) for v in obj]\n",
+    "    if isinstance(obj, np.ndarray):\n",
+    "        return json_safe(obj.tolist())\n",
+    "    # np.bool_ is not a subclass of bool, so json.dumps rejects it unless it is\n",
+    "    # converted here.\n",
+    "    if isinstance(obj, np.bool_):\n",
+    "        return bool(obj)\n",
+    "    if isinstance(obj, np.integer):\n",
+    "        return int(obj)\n",
+    "    if isinstance(obj, (float, np.floating)):\n",
+    "        # Non-finite values would be emitted as NaN / Infinity tokens, which\n",
+    "        # strict JSON parsers refuse.\n",
+    "        return float(obj) if np.isfinite(obj) else None\n",
+    "    try:\n",
+    "        missing = pd.isna(obj)\n",
+    "    except (TypeError, ValueError):\n",
+    "        # Best effort: objects pandas cannot inspect are passed through.\n",
+    "        return obj\n",
+    "    # pd.isna returns an array for array-likes; only a scalar True means the\n",
+    "    # value itself is missing.\n",
+    "    if isinstance(missing, (bool, np.bool_)) and missing:\n",
+    "        return None\n",
+    "    return obj\n",
+    "\n",
+    "def write_json(path: str | Path, payload: Any) -> None:\n",
+    "    \"\"\"Serialize ``payload`` to ``path`` as 2-space-indented UTF-8 JSON.\n",
+    "\n",
+    "    Missing parent directories are created first, and the payload is passed\n",
+    "    through ``json_safe`` before encoding.\n",
+    "    \"\"\"\n",
+    "    target = Path(path)\n",
+    "    target.parent.mkdir(parents=True, exist_ok=True)\n",
+    "    text = json.dumps(json_safe(payload), indent=2)\n",
+    "    target.write_text(text, encoding=\"utf-8\")\n",
+    "\n"
   ]
  },
  {
@@ -477,14 +846,22 @@
   "source": [
    "## Save Model and Artifacts\n",
    "\n",
-    "We save the trained model checkpoint and evaluation metrics for use in notebook 04 (route evaluation) and for future inference."
+    "We save the trained model checkpoint and evaluation metrics for use in notebook 04 (route evaluation) and for future inference.\n",
+    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "save_model",
-   "metadata": {},
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2026-06-07T19:12:05.439746Z",
+     "iopub.status.busy": "2026-06-07T19:12:05.439205Z",
+     "iopub.status.idle": "2026-06-07T19:12:05.604325Z",
+     "shell.execute_reply": "2026-06-07T19:12:05.603607Z"
+    }
+   },
   "outputs": [],
   "source": [
    "# Save model checkpoint\n",
@@ -520,13 +897,14 @@
    "pred_df.to_csv(OUT_DIR / \"test_predictions.csv\", index=False)\n",
    "board_metrics_df.to_csv(OUT_DIR / \"board_metrics.csv\", index=False)\n",
    "\n",
-    "from climbingboardgpt.utils import write_json\n",
+    "# write_json is defined in the JSON output helper cell above.\n",
    "write_json(OUT_DIR / \"overall_metrics.json\", overall_metrics)\n",
    "\n",
    "print(f\"Saved model checkpoint to: {model_path}\")\n",
    "print(f\"Saved training history to: {OUT_DIR / 'training_history.csv'}\")\n",
    "print(f\"Saved test predictions to: {OUT_DIR / 'test_predictions.csv'}\")\n",
-    "print(f\"Saved board metrics to: {OUT_DIR / 'board_metrics.csv'}\")"
+    "print(f\"Saved board metrics to: {OUT_DIR / 'board_metrics.csv'}\")\n",
+    "\n"
   ]
  },
  {
@@ -542,7 +920,8 @@
    "\n",
    "3. **Joint training across boards**: By training on both TB2 and Kilter data simultaneously, the model can share statistical strength. The board token (`<BOARD_TB2>` vs `<BOARD_KILTER>`) tells it which \"language\" it's operating in.\n",
    "\n",
-    "4. **The gap between fine-grained and grouped metrics**: Being off by 1 difficulty point often stays within the same V-grade bucket. This is why the ±1 V-grade accuracy is much higher than the ±1 difficulty accuracy."
+    "4. **The gap between fine-grained and grouped metrics**: Being off by 1 difficulty point often stays within the same V-grade bucket. This is why the ±1 V-grade accuracy is much higher than the ±1 difficulty accuracy.\n",
+    "\n"
   ]
  }
 ],
@@ -553,8 +932,16 @@
   "name": "python3"
  },
  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
   "name": "python",
-   "version": "3.14.4"
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.12"
  }
 },
 "nbformat": 4,