docs: add performance and architectural notes, narrow exception handling

2026-05-29 14:47:33 -04:00
parent 69389d1ebf
commit 9874c19daa
3 changed files with 29 additions and 2 deletions
@@ -84,7 +84,17 @@ def nearest_real_route_same_board(
    generated_board_key: str,
    real_df: pd.DataFrame,
 ) -> dict[str, object]:
-    """Find the most similar real route on the same board by Jaccard score."""
+    """Find the most similar real route on the same board by Jaccard score.
    .. note::
       This function performs an O(n) linear scan over all real routes for
       the matching board, computing a Jaccard similarity for each one. With
       ~256K training examples, evaluating 400 generated routes costs roughly
       O(100M) Jaccard comparisons. This is acceptable for evaluation scripts
       but would not scale to a real-time or high-throughput setting without
       an approximate nearest-neighbour index.
    """
    board_frame = real_df[real_df["board_key"] == generated_board_key]
    if board_frame.empty:
        return {
@@ -85,6 +85,23 @@ class JointRouteGPT(nn.Module):
    PyTorch's ``TransformerEncoder`` is used with a causal mask, which makes it
    behave like a decoder-only language model for short route sequences.
    Why use ``TransformerEncoder`` rather than ``TransformerDecoder``?
    -------------------------------------------------------------------
    PyTorch's ``TransformerDecoderLayer`` expects two inputs: a decoder
    sequence and a separate encoder memory for cross-attention. For
    unconditional or prompt-conditioned generation there is no encoder,
    so ``TransformerDecoderLayer`` would always ignore the second input
    or require a dummy placeholder. Using ``TransformerEncoder`` with a
    causal mask avoids this mismatch, keeps the module list uniform,
    and produces identical behaviour for short autoregressive generation.
    The trade-off is that ``TransformerEncoder`` does not natively prevent
    attention to future positions — the causal mask must be constructed
    manually (see ``forward``). For the sequence lengths seen here
    (at most ~400 tokens) the overhead of the upper-triangular mask is
    negligible, and ``enable_nested_tensor=False`` is set to avoid SDPA
    optimisations that do not support masked encoders.
    """
    def __init__(
@@ -66,7 +66,7 @@ def parse_tokens(value) -> list[str]:
        parsed = ast.literal_eval(value)
        if isinstance(parsed, list):
            return [str(v) for v in parsed]
-    except Exception:
+    except (SyntaxError, ValueError):
        pass
    return value.split()