docs: add performance and architectural notes, narrow exception handling
This commit is contained in:
@@ -84,7 +84,17 @@ def nearest_real_route_same_board(
|
|||||||
generated_board_key: str,
|
generated_board_key: str,
|
||||||
real_df: pd.DataFrame,
|
real_df: pd.DataFrame,
|
||||||
) -> dict[str, object]:
|
) -> dict[str, object]:
|
||||||
"""Find the most similar real route on the same board by Jaccard score."""
|
"""Find the most similar real route on the same board by Jaccard score.
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
This function performs an O(n) linear scan over all real routes for
|
||||||
|
the matching board, computing a Jaccard similarity for each one. With
|
||||||
|
~256K training examples, evaluating 400 generated routes costs roughly
|
||||||
|
100M Jaccard comparisons in the worst case. This is acceptable for evaluation scripts
|
||||||
|
but would not scale to a real-time or high-throughput setting without
|
||||||
|
an approximate nearest-neighbour index.
|
||||||
|
"""
|
||||||
board_frame = real_df[real_df["board_key"] == generated_board_key]
|
board_frame = real_df[real_df["board_key"] == generated_board_key]
|
||||||
if board_frame.empty:
|
if board_frame.empty:
|
||||||
return {
|
return {
|
||||||
|
|||||||
@@ -85,6 +85,23 @@ class JointRouteGPT(nn.Module):
|
|||||||
|
|
||||||
PyTorch's ``TransformerEncoder`` is used with a causal mask, which makes it
|
PyTorch's ``TransformerEncoder`` is used with a causal mask, which makes it
|
||||||
behave like a decoder-only language model for short route sequences.
|
behave like a decoder-only language model for short route sequences.
|
||||||
|
|
||||||
|
Why use ``TransformerEncoder`` rather than ``TransformerDecoder``?
|
||||||
|
-------------------------------------------------------------------
|
||||||
|
PyTorch's ``TransformerDecoderLayer`` expects two inputs: a decoder
|
||||||
|
sequence and a separate encoder memory for cross-attention. For
|
||||||
|
unconditional or prompt-conditioned generation there is no encoder,
|
||||||
|
so ``TransformerDecoderLayer`` would always ignore the second input
|
||||||
|
or require a dummy placeholder. Using ``TransformerEncoder`` with a
|
||||||
|
causal mask avoids this mismatch, keeps the module list uniform,
|
||||||
|
and produces identical behaviour for short autoregressive generation.
|
||||||
|
|
||||||
|
The trade-off is that ``TransformerEncoder`` does not natively prevent
|
||||||
|
attention to future positions — the causal mask must be constructed
|
||||||
|
manually (see ``forward``). For the sequence lengths seen here
|
||||||
|
(at most ~400 tokens) the overhead of the upper-triangular mask is
|
||||||
|
negligible, and ``enable_nested_tensor=False`` is set to disable the
|
||||||
|
nested-tensor fast path, which only benefits heavily padded batches.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
|
|||||||
@@ -66,7 +66,7 @@ def parse_tokens(value) -> list[str]:
|
|||||||
parsed = ast.literal_eval(value)
|
parsed = ast.literal_eval(value)
|
||||||
if isinstance(parsed, list):
|
if isinstance(parsed, list):
|
||||||
return [str(v) for v in parsed]
|
return [str(v) for v in parsed]
|
||||||
except Exception:
|
except (SyntaxError, ValueError):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
return value.split()
|
return value.split()
|
||||||
|
|||||||
Reference in New Issue
Block a user