From 9874c19daa5c9c84a209b5840af89eef2d740aa0 Mon Sep 17 00:00:00 2001 From: Pawel Date: Fri, 29 May 2026 14:47:33 -0400 Subject: [PATCH] docs: add performance and architectural notes, narrow exception handling --- src/climbingboardgpt/evaluation.py | 12 +++++++++++- src/climbingboardgpt/models.py | 17 +++++++++++++++++ src/climbingboardgpt/tokenization.py | 2 +- 3 files changed, 29 insertions(+), 2 deletions(-) diff --git a/src/climbingboardgpt/evaluation.py b/src/climbingboardgpt/evaluation.py index 18eb1df..d08e854 100644 --- a/src/climbingboardgpt/evaluation.py +++ b/src/climbingboardgpt/evaluation.py @@ -84,7 +84,17 @@ def nearest_real_route_same_board( generated_board_key: str, real_df: pd.DataFrame, ) -> dict[str, object]: - """Find the most similar real route on the same board by Jaccard score.""" + """Find the most similar real route on the same board by Jaccard score. + + .. note:: + + This function performs an O(n) linear scan over all real routes for + the matching board, computing a Jaccard similarity for each one. With + ~256K training examples, evaluating 400 generated routes costs roughly + O(100M) Jaccard comparisons. This is acceptable for evaluation scripts + but would not scale to a real-time or high-throughput setting without + an approximate nearest-neighbour index. + """ board_frame = real_df[real_df["board_key"] == generated_board_key] if board_frame.empty: return { diff --git a/src/climbingboardgpt/models.py b/src/climbingboardgpt/models.py index 893aabe..5815fa4 100644 --- a/src/climbingboardgpt/models.py +++ b/src/climbingboardgpt/models.py @@ -85,6 +85,23 @@ class JointRouteGPT(nn.Module): PyTorch's ``TransformerEncoder`` is used with a causal mask, which makes it behave like a decoder-only language model for short route sequences. + + Why use ``TransformerEncoder`` rather than ``TransformerDecoder``? + ------------------------------------------------------------------- + PyTorch's ``TransformerDecoderLayer`` expects two inputs: a decoder + sequence and a separate encoder memory for cross-attention. For + unconditional or prompt-conditioned generation there is no encoder, + so ``TransformerDecoderLayer`` would always ignore the second input + or require a dummy placeholder. Using ``TransformerEncoder`` with a + causal mask avoids this mismatch, keeps the module list uniform, + and produces identical behaviour for short autoregressive generation. + + The trade-off is that ``TransformerEncoder`` does not natively prevent + attention to future positions — the causal mask must be constructed + manually (see ``forward``). For the sequence lengths seen here + (at most ~400 tokens) the overhead of the upper-triangular mask is + negligible, and ``enable_nested_tensor=False`` is set to avoid SDPA + optimisations that do not support masked encoders. """ def __init__( diff --git a/src/climbingboardgpt/tokenization.py b/src/climbingboardgpt/tokenization.py index 8931577..ef7469a 100644 --- a/src/climbingboardgpt/tokenization.py +++ b/src/climbingboardgpt/tokenization.py @@ -66,7 +66,7 @@ def parse_tokens(value) -> list[str]: parsed = ast.literal_eval(value) if isinstance(parsed, list): return [str(v) for v in parsed] - except Exception: + except (SyntaxError, ValueError): pass return value.split()