From 9874c19daa5c9c84a209b5840af89eef2d740aa0 Mon Sep 17 00:00:00 2001
From: Pawel <pawel.sarkowicz@ik.me>
Date: Fri, 29 May 2026 14:47:33 -0400
Subject: [PATCH] docs: add performance and architectural notes, narrow
 exception handling

---
 src/climbingboardgpt/evaluation.py   | 12 +++++++++++-
 src/climbingboardgpt/models.py       | 17 +++++++++++++++++
 src/climbingboardgpt/tokenization.py |  2 +-
 3 files changed, 29 insertions(+), 2 deletions(-)

diff --git a/src/climbingboardgpt/evaluation.py b/src/climbingboardgpt/evaluation.py
index 18eb1df..d08e854 100644
--- a/src/climbingboardgpt/evaluation.py
+++ b/src/climbingboardgpt/evaluation.py
@@ -84,7 +84,17 @@ def nearest_real_route_same_board(
     generated_board_key: str,
     real_df: pd.DataFrame,
 ) -> dict[str, object]:
-    """Find the most similar real route on the same board by Jaccard score."""
+    """Find the most similar real route on the same board by Jaccard score.
+
+    .. note::
+
+       This function performs an O(n) linear scan over all real routes for
+       the matching board, computing a Jaccard similarity for each one. With
+       ~256K training examples, evaluating 400 generated routes costs roughly
+       O(100M) Jaccard comparisons. This is acceptable for evaluation scripts
+       but would not scale to a real-time or high-throughput setting without
+       an approximate nearest-neighbour index.
+    """
     board_frame = real_df[real_df["board_key"] == generated_board_key]
     if board_frame.empty:
         return {
diff --git a/src/climbingboardgpt/models.py b/src/climbingboardgpt/models.py
index 893aabe..5815fa4 100644
--- a/src/climbingboardgpt/models.py
+++ b/src/climbingboardgpt/models.py
@@ -85,6 +85,23 @@ class JointRouteGPT(nn.Module):
 
     PyTorch's ``TransformerEncoder`` is used with a causal mask, which makes it
     behave like a decoder-only language model for short route sequences.
+
+    Why use ``TransformerEncoder`` rather than ``TransformerDecoder``?
+    -------------------------------------------------------------------
+    PyTorch's ``TransformerDecoderLayer`` expects two inputs: a decoder
+    sequence and a separate encoder memory for cross-attention. For
+    unconditional or prompt-conditioned generation there is no encoder,
+    so ``TransformerDecoderLayer`` would always ignore the second input
+    or require a dummy placeholder. Using ``TransformerEncoder`` with a
+    causal mask avoids this mismatch, keeps the module list uniform,
+    and produces identical behaviour for short autoregressive generation.
+
+    The trade-off is that ``TransformerEncoder`` does not natively prevent
+    attention to future positions — the causal mask must be constructed
+    manually (see ``forward``). For the sequence lengths seen here
+    (at most ~400 tokens) the overhead of the upper-triangular mask is
+    negligible, and ``enable_nested_tensor=False`` is set to avoid SDPA
+    optimisations that do not support masked encoders.
     """
 
     def __init__(
diff --git a/src/climbingboardgpt/tokenization.py b/src/climbingboardgpt/tokenization.py
index 8931577..ef7469a 100644
--- a/src/climbingboardgpt/tokenization.py
+++ b/src/climbingboardgpt/tokenization.py
@@ -66,7 +66,7 @@ def parse_tokens(value) -> list[str]:
         parsed = ast.literal_eval(value)
         if isinstance(parsed, list):
             return [str(v) for v in parsed]
-    except Exception:
+    except (SyntaxError, ValueError):
         pass
 
     return value.split()