# ClimbingBoardGPT/scripts/01_tokenize_routes.py

#!/usr/bin/env python3
"""
ClimbingBoardGPT — Route Tokenization Script

This script converts raw climbing route data from SQLite databases into
tokenized sequences suitable for training transformer models.

What is tokenization?
---------------------
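In NLP, tokenization converts raw text into discrete symbols (tokens) that
a model can process. For example, GPT-2 uses Byte-Pair Encoding (BPE), which
can split a word such as "climbing" into subword pieces like
["cl", "imb", "ing"] (illustrative; the exact split depends on the vocabulary).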

For climbing routes, we tokenize differently:
- Each hold on the board becomes a unique token (e.g., <TB2_p344_start>)
- Board identity, angle, and grade become conditioning tokens
- Special tokens mark sequence boundaries (<BOS>, <EOS>, etc.)

The key insight: climbing routes ARE sequences, just like sentences. The
same transformer architectures that learn English grammar can learn "climb
grammar" — which holds tend to follow which, how start holds differ from
finish holds, etc.

This script:
1. Loads board configurations from JSON files
2. Queries SQLite databases for climb and placement data
3. Parses frame strings (e.g., "p344r5p369r6p603r7") into structured data
4. Maps board-specific role IDs to shared semantic roles
5. Canonicalizes hold order (starts first, then middles by Y, etc.)
6. Generates two token sequences per route:
   - with_grade: includes <GRADE_V6> for GPT training
   - without_grade: excludes grade for BERT-style prediction
7. Builds vocabulary, train/val/test splits, and saves all artifacts

Usage:
    python scripts/01_tokenize_routes.py --boards tb2,kilter
    python scripts/01_tokenize_routes.py --boards tb2
    python scripts/01_tokenize_routes.py --boards kilter
"""
from __future__ import annotations

import argparse
import json
import sys
from pathlib import Path

# Set up the project root so we can import our custom package.
# This script is run as scripts/01_tokenize_routes.py, so parents[1] of its
# resolved path is the directory that contains scripts/.
REPO_ROOT = Path(__file__).resolve().parents[1]
# Insert <REPO_ROOT>/src at the front of sys.path so the climbingboardgpt
# imports below are looked up in the source tree first.
sys.path.insert(0, str(REPO_ROOT / "src"))

import pandas as pd

from climbingboardgpt.config import load_board_configs, parse_board_keys
from climbingboardgpt.data import load_multi_board_data
from climbingboardgpt.tokenization import (
    build_route_records,
    build_token_metadata,
    build_vocab,
    encode,
    make_placement_lookup,
    vocab_payload,
)
from climbingboardgpt.utils import json_safe, safe_train_test_split, set_seed, write_json


def parse_args() -> argparse.Namespace:
    """Build the command-line parser for this script and parse ``sys.argv``.

    Options:
        --boards: Comma-separated board config names. Default: "tb2,kilter"
        --out-dir: Directory that receives the tokenized artifacts
        --seed: Random seed used for reproducible train/val/test splits

    Returns:
        The parsed arguments, exposed as ``boards``, ``out_dir`` and ``seed``.
    """
    # Shown verbatim after the option list in --help (RawDescriptionHelpFormatter
    # keeps the line breaks as written).
    usage_examples = """
Examples:
  # Tokenize both boards (default)
  python scripts/01_tokenize_routes.py --boards tb2,kilter

  # Tokenize only TB2
  python scripts/01_tokenize_routes.py --boards tb2

  # Custom output directory
  python scripts/01_tokenize_routes.py --out-dir /path/to/output
        """

    # One (flag, add_argument keyword arguments) pair per option, in help order.
    option_table = (
        (
            "--boards",
            {
                "type": str,
                "default": "tb2,kilter",
                "help": "Comma-separated board config names (default: tb2,kilter)",
            },
        ),
        (
            "--out-dir",
            {
                "type": Path,
                "default": REPO_ROOT / "data" / "processed" / "tokenized",
                "help": "Output directory for tokenized artifacts",
            },
        ),
        (
            "--seed",
            {
                "type": int,
                "default": 42,
                "help": "Random seed for reproducible splits (default: 42)",
            },
        ),
    )

    cli = argparse.ArgumentParser(
        description="Tokenize TB2 and/or Kilter routes for ClimbingBoardGPT.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples,
    )
    for flag, options in option_table:
        cli.add_argument(flag, **options)
    return cli.parse_args()


# Structural tokens reported as "Special tokens" in the vocabulary breakdown.
_SPECIAL_TOKENS = frozenset({"<PAD>", "<UNK>", "<BOS>", "<EOS>", "<CLS>", "<MASK>"})

# Columns written to route_sequences.csv, in output order.
_ROUTE_CSV_COLUMNS = [
    "uuid", "board_key", "board_display_name", "board_token_prefix", "board_token",
    "climb_name", "setter_username", "layout_id", "layout_name", "board_name",
    "frames", "angle", "display_difficulty", "grouped_v", "boulder_grade",
    "ascensionist_count", "quality_average", "fa_at",
    "n_holds", "n_start", "n_middle", "n_foot", "n_finish",
    "sequence_with_grade", "sequence_no_grade", "split",
]


def _print_board_counts(df: pd.DataFrame, unit: str) -> None:
    """Print the number of rows of *df* per ``board_key``, in order of first appearance."""
    print("  Per board:")
    for board_key in df["board_key"].unique():
        n = (df["board_key"] == board_key).sum()
        print(f"    {board_key}: {n:,} {unit}")


def _print_vocab_breakdown(vocab_tokens) -> None:
    """Print how many vocabulary tokens fall into each token category."""
    counts = {
        "Special": sum(1 for t in vocab_tokens if t in _SPECIAL_TOKENS),
        "Board": sum(1 for t in vocab_tokens if t.startswith("<BOARD_")),
        "Angle": sum(1 for t in vocab_tokens if t.startswith("<ANGLE_")),
        "Grade": sum(1 for t in vocab_tokens if t.startswith("<GRADE_")),
        # Hold tokens look like <TB2_p344_start>; the "_p" marker identifies them.
        "Hold": sum(1 for t in vocab_tokens if "_p" in t),
    }
    for label, count in counts.items():
        print(f"  {label} tokens: {count}")


def _assign_splits(df_routes: pd.DataFrame, seed: int) -> None:
    """Add ``split_stratum`` and ``split`` columns to *df_routes* in place.

    Routes are split 80/10/10 into train/val/test, stratified by
    board_key × grouped_v so that both boards and all difficulty levels are
    represented in each split. Stratification prevents scenarios like "all
    V14 climbs end up in the test set while training has none."
    """
    df_routes["split_stratum"] = (
        df_routes["board_key"].astype(str)
        + "__V"
        + df_routes["grouped_v"].astype(str)
    )

    # First carve off 20%, then halve that remainder into val and test.
    train_df, temp_df = safe_train_test_split(
        df_routes,
        test_size=0.20,
        random_state=seed,
        stratify_col="split_stratum",
    )
    val_df, test_df = safe_train_test_split(
        temp_df,
        test_size=0.50,
        random_state=seed,
        stratify_col="split_stratum",
    )

    # NOTE(review): labels are keyed by uuid, so if one uuid occurs in several
    # rows (e.g. one row per angle — TODO confirm against build_route_records),
    # the label written last (train < val < test) wins for all of its rows.
    split_map = {}
    for label, part in (("train", train_df), ("val", val_df), ("test", test_df)):
        split_map.update(dict.fromkeys(part["uuid"], label))
    df_routes["split"] = df_routes["uuid"].map(split_map)


def _save_artifacts(
    out_dir: Path,
    df_routes: pd.DataFrame,
    df_placements: pd.DataFrame,
    df_token_meta: pd.DataFrame,
    stoi,
    itos,
    configs_by_key,
) -> None:
    """Write every tokenization artifact into *out_dir*.

    Several formats are saved for different use cases:
    - CSV: easy to load in pandas for analysis
    - JSONL: easy to stream for training
    - JSON: vocabulary mapping for model loading
    """
    jsonl_path = out_dir / "routes_tokenized.jsonl"
    with jsonl_path.open("w", encoding="utf-8") as handle:
        handle.writelines(
            json.dumps(json_safe(record)) + "\n"
            for record in df_routes.to_dict(orient="records")
        )

    df_routes[_ROUTE_CSV_COLUMNS].to_csv(out_dir / "route_sequences.csv", index=False)
    df_placements.to_csv(out_dir / "placement_metadata.csv", index=False)
    df_token_meta.to_csv(out_dir / "token_metadata.csv", index=False)
    write_json(out_dir / "token_vocab.json", vocab_payload(stoi, itos, configs_by_key))

    # Board summary statistics
    board_summary = (
        df_routes.groupby("board_key")
        .agg(
            n_routes=("uuid", "count"),
            mean_angle=("angle", "mean"),
            mean_display_difficulty=("display_difficulty", "mean"),
            mean_holds=("n_holds", "mean"),
        )
        .reset_index()
    )
    board_summary.to_csv(out_dir / "board_summary.csv", index=False)


def main() -> None:
    """Main entry point for route tokenization.

    This function orchestrates the entire tokenization pipeline:
    1. Load board configurations
    2. Query databases for raw climb and placement data
    3. Build tokenized route records (frames parsing, canonical hold ordering)
    4. Construct vocabulary from all unique tokens
    5. Encode token sequences as integer IDs
    6. Split data into train/val/test sets (stratified by board × grade)
    7. Build token metadata (coordinates, roles, etc.)
    8. Save all artifacts to disk

    Raises:
        RuntimeError: If no routes were tokenized.
    """
    args = parse_args()

    # Set random seed for reproducibility
    # This ensures train/val/test splits are the same across runs
    set_seed(args.seed)
    args.out_dir.mkdir(parents=True, exist_ok=True)

    # ─────────────────────────────────────────────────────────────────────
    # Step 1: Load board configurations
    # ─────────────────────────────────────────────────────────────────────
    # Each board has a JSON config file specifying:
    # - layout_id: Which layout in the database to use
    # - role_definitions: Maps semantic roles (start, middle, etc.) to numeric IDs
    # - max_angle: Filter out routes steeper than this
    # - token_prefix: Namespace for hold tokens (prevents ID collisions)
    #
    # This config-driven approach means adding a new board only requires
    # creating a new JSON file, not modifying code.
    board_keys = parse_board_keys(args.boards)
    configs = load_board_configs(board_keys)
    configs_by_key = {config.board_key: config for config in configs}
    configs_by_prefix = {config.token_prefix: config for config in configs}

    print(f"Loaded {len(configs)} board configuration(s):")
    for config in configs:
        print(f"  {config.display_name} (key={config.board_key}, prefix={config.token_prefix})")
        print(f"    layout_id={config.layout_id}, max_angle={config.max_angle}")
        print(f"    role_definitions={config.role_definitions}")

    # ─────────────────────────────────────────────────────────────────────
    # Step 2: Load raw data from SQLite databases
    # ─────────────────────────────────────────────────────────────────────
    # Each board has its own SQLite database containing:
    # - climbs table: route metadata (name, setter, frames string, etc.)
    # - climb_stats table: angle, difficulty, ascensionist count, quality
    # - placements table: physical hold positions and default roles
    # - holes table: (x, y) coordinates for each placement
    # - difficulty_grades table: mapping from numeric difficulty to V-grades
    #
    # The frames string is the core data — it encodes which holds are used
    # and their roles, e.g., "p344r5p369r6p603r7" means:
    #   placement 344 with role 5 (start)
    #   placement 369 with role 6 (middle)
    #   placement 603 with role 7 (finish)
    print("\nLoading data from databases...")
    df_climbs, df_placements = load_multi_board_data(configs, project_root=REPO_ROOT)
    placement_lookup = make_placement_lookup(df_placements)

    print(f"  Total climb-angle entries: {len(df_climbs):,}")
    print(f"  Total placements: {len(df_placements):,}")
    _print_board_counts(df_climbs, "entries")

    # ─────────────────────────────────────────────────────────────────────
    # Step 3: Build tokenized route records
    # ─────────────────────────────────────────────────────────────────────
    # This is the core tokenization step. For each climb:
    # 1. Parse the frames string into (placement_id, role_id) pairs
    # 2. Map role IDs to semantic names using board config
    # 3. Sort holds canonically: starts first, then middles by Y, etc.
    # 4. Generate two token sequences:
    #    - with_grade: <BOS> <BOARD_X> <ANGLE_Y> <GRADE_VZ> <holds...> <EOS>
    #    - without_grade: <BOS> <BOARD_X> <ANGLE_Y> <holds...> <EOS>
    #
    # The grade-included version is for the GPT generator (which conditions
    # on grade). The grade-excluded version is for the BERT-style predictor
    # (which must predict grade, not see it).
    print("\nBuilding tokenized route records...")
    df_routes = build_route_records(
        df_climbs=df_climbs,
        configs_by_key=configs_by_key,
        placement_lookup=placement_lookup,
    )
    if df_routes.empty:
        raise RuntimeError("No routes were tokenized. Check raw DBs and board configs.")

    print(f"  Tokenized routes: {len(df_routes):,}")
    _print_board_counts(df_routes, "routes")

    # ─────────────────────────────────────────────────────────────────────
    # Step 4: Build the shared vocabulary
    # ─────────────────────────────────────────────────────────────────────
    # The vocabulary maps each unique token to an integer ID.
    # This is analogous to how GPT-2's tokenizer maps subwords to IDs.
    #
    # Vocabulary structure:
    # 1. Special tokens (IDs 0-5): <PAD>, <UNK>, <BOS>, <EOS>, <CLS>, <MASK>
    # 2. Board tokens: <BOARD_TB2>, <BOARD_KILTER>
    # 3. Angle tokens: <ANGLE_10>, <ANGLE_15>, ..., <ANGLE_55>
    # 4. Grade tokens: <GRADE_V0>, <GRADE_V1>, ..., <GRADE_V16>
    # 5. Hold tokens: <TB2_p344_start>, <KILTER_p1084_middle>, etc.
    #
    # Hold tokens are namespaced by board to prevent ID collisions.
    # TB2 placement 344 and Kilter placement 344 are different physical holds.
    print("\nBuilding vocabulary...")
    vocab_tokens, stoi, itos = build_vocab(df_routes)

    print(f"  Vocabulary size: {len(stoi):,}")
    _print_vocab_breakdown(vocab_tokens)

    # ─────────────────────────────────────────────────────────────────────
    # Step 5: Encode token sequences as integer IDs
    # ─────────────────────────────────────────────────────────────────────
    # Convert string tokens to integer IDs for model input, the same way a
    # text tokenizer turns words into IDs (the numbers here are illustrative):
    #   "<BOS> <BOARD_TB2> <TB2_p344_start>" → [2, 6, 42]
    df_routes["ids_with_grade"] = df_routes["tokens_with_grade"].apply(lambda tokens: encode(tokens, stoi))
    df_routes["ids_no_grade"] = df_routes["tokens_no_grade"].apply(lambda tokens: encode(tokens, stoi))

    # ─────────────────────────────────────────────────────────────────────
    # Step 6: Train/val/test split (stratified)
    # ─────────────────────────────────────────────────────────────────────
    _assign_splits(df_routes, args.seed)

    # df_routes is not modified after this point, so the table is computed
    # once and printed again in the final summary.
    split_counts = df_routes.groupby(["board_key", "split"]).size().unstack(fill_value=0)
    print("\nSplit counts:")
    print(split_counts)

    # ─────────────────────────────────────────────────────────────────────
    # Step 7: Build token metadata
    # ─────────────────────────────────────────────────────────────────────
    # Each token has associated metadata:
    # - kind: "special", "board", "angle", "grade", or "hold"
    # - For hold tokens: board_key, placement_id, role, x, y, x_norm, y_norm
    # - For angle tokens: the angle value
    # - For grade tokens: the V-grade value
    #
    # The coordinate features (x_norm, y_norm, is_hold) are injected into
    # the grade predictor model as additional embeddings alongside token
    # embeddings. This gives the model direct spatial information.
    print("\nBuilding token metadata...")
    df_token_meta = build_token_metadata(
        vocab_tokens=vocab_tokens,
        stoi=stoi,
        df_placements=df_placements,
        placement_lookup=placement_lookup,
        configs_by_prefix=configs_by_prefix,
    )
    print(f"  Token metadata rows: {len(df_token_meta):,}")

    # ─────────────────────────────────────────────────────────────────────
    # Step 8: Save all artifacts
    # ─────────────────────────────────────────────────────────────────────
    print("\nSaving artifacts...")
    _save_artifacts(
        args.out_dir,
        df_routes,
        df_placements,
        df_token_meta,
        stoi,
        itos,
        configs_by_key,
    )

    banner = "=" * 60
    print(f"\n{banner}")
    print("Tokenization complete!")
    print(banner)
    print(f"Boards: {board_keys}")
    print(f"Tokenized routes: {len(df_routes):,}")
    print(f"Vocabulary size: {len(stoi):,}")
    print("Split counts:")
    print(split_counts)
    print(f"\nSaved artifacts to: {args.out_dir}")
    # Lists every entry in the output directory, including any left over
    # from earlier runs.
    for artifact in sorted(args.out_dir.iterdir()):
        size_mb = artifact.stat().st_size / 1e6
        print(f"  {artifact.name} ({size_mb:.1f} MB)")

# Run the pipeline only when this file is executed as a script, not when it
# is imported.
if __name__ == "__main__":
    main()