ClimbingBoardGPT/scripts/04_evaluate_generated_routes.py

#!/usr/bin/env python3
"""
ClimbingBoardGPT — Generated Route Evaluation Script

This script evaluates routes generated by the GPT model on four dimensions:

1. Validity: Does the route follow structural rules?
   - At least 3 holds
   - No duplicate placements
   - At least one start and one finish hold
   - All holds from the same board

2. Novelty: Is the route different from existing climbs?
   - Measured by Jaccard distance from the nearest real route

3. Geometric plausibility: Are holds in reasonable positions?
   - Height, width, mean hand reach distance

4. Grade consistency: Does the route's predicted grade match the request?
   - Uses the trained grade predictor as a "critic"

This is analogous to how language models are evaluated using BLEU, ROUGE,
or human evaluation — but adapted for the climbing domain.

Usage:
    python scripts/04_evaluate_generated_routes.py
    python scripts/04_evaluate_generated_routes.py --grade-model-path models/joint_transformer_grade_predictor.pth
"""
from __future__ import annotations

import argparse
import sys
from pathlib import Path

REPO_ROOT = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(REPO_ROOT / "src"))

import numpy as np
import pandas as pd
import torch

from climbingboardgpt.evaluation import (
    build_placement_coords,
    frames_to_holds,
    holds_to_placement_set,
    nearest_real_route_same_board,
    parse_token_list,
    simple_route_features,
    tokens_to_hold_records,
    validity_from_records,
)
from climbingboardgpt.grades import to_grouped_v
from climbingboardgpt.models import JointRouteTransformerRegressor


def parse_args() -> argparse.Namespace:
    """Parse command-line arguments for route evaluation."""
    parser = argparse.ArgumentParser(
        description="Evaluate generated TB2/Kilter route candidates.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("--tokenized-dir", type=Path, default=REPO_ROOT / "data" / "processed" / "tokenized")
    parser.add_argument("--generated-dir", type=Path, default=REPO_ROOT / "data" / "processed" / "generation")
    parser.add_argument("--out-dir", type=Path, default=REPO_ROOT / "data" / "processed" / "evaluation")
    parser.add_argument("--grade-model-path", type=Path, default=REPO_ROOT / "models" / "joint_transformer_grade_predictor.pth")
    parser.add_argument("--device", type=str, default=None)
    return parser.parse_args()


def load_grade_critic(model_path: Path, device: torch.device):
    """Load the trained grade predictor model as a critic.

    The critic is used to predict the difficulty of generated routes.
    If we asked for V6 and the critic predicts V6 ± 1, the generation
    is grade-consistent.

    This is similar to how GANs use a discriminator, except our critic
    is a regression model rather than a binary classifier.

    Args:
        model_path: Path to the saved model checkpoint
        device: torch device

    Returns:
        Dictionary with model, vocabulary, and config, or None if not found
    """
    if not model_path.exists():
        return None
    try:
        checkpoint = torch.load(model_path, map_location=device, weights_only=False)
    except TypeError:
        checkpoint = torch.load(model_path, map_location=device)

    cfg = checkpoint["config"]
    stoi = {str(k): int(v) for k, v in checkpoint["stoi"].items()}
    coord_features = checkpoint["coord_features"]
    if not isinstance(coord_features, torch.Tensor):
        coord_features = torch.tensor(coord_features, dtype=torch.float32)

    model = JointRouteTransformerRegressor(
        vocab_size=cfg["vocab_size"],
        max_len=cfg["max_len"],
        coord_features=coord_features,
        d_model=cfg.get("d_model", 128),
        nhead=cfg.get("nhead", 4),
        num_layers=cfg.get("num_layers", 4),
        dim_feedforward=cfg.get("dim_feedforward", 256),
        dropout=cfg.get("dropout", 0.10),
        pad_id=cfg.get("pad_id", stoi["<PAD>"]),
    ).to(device)
    model.load_state_dict(checkpoint["model_state_dict"])
    model.eval()

    return {
        "model": model,
        "stoi": stoi,
        "pad_id": stoi["<PAD>"],
        "unk_id": stoi["<UNK>"],
        "max_len": cfg["max_len"],
    }


def predict_generated_grade(tokens: list[str], critic, device: torch.device) -> float:
    """Use the critic model to predict the difficulty of a generated route.

    Args:
        tokens: List of token strings (from generated route)
        critic: Dictionary with model and vocabulary
        device: torch device

    Returns:
        Predicted difficulty score (continuous value)
    """
    model = critic["model"]
    stoi = critic["stoi"]
    pad_id = critic["pad_id"]
    unk_id = critic["unk_id"]
    max_len = critic["max_len"]

    # Remove grade tokens (we want the model to predict, not see the grade)
    tokens = [token for token in tokens if not token.startswith("<GRADE_")]
    # Replace <BOS> with <CLS> for the encoder model
    if tokens and tokens[0] == "<BOS>":
        tokens = ["<CLS>"] + tokens[1:]
    else:
        tokens = ["<CLS>"] + tokens

    # Encode tokens to IDs and pad to max_len
    ids = [stoi.get(token, unk_id) for token in tokens][:max_len]
    mask = [1] * len(ids)
    if len(ids) < max_len:
        pad_n = max_len - len(ids)
        ids += [pad_id] * pad_n
        mask += [0] * pad_n

    with torch.no_grad():
        input_ids = torch.tensor([ids], dtype=torch.long, device=device)
        attention_mask = torch.tensor([mask], dtype=torch.bool, device=device)
        return float(model(input_ids, attention_mask).cpu().item())


def main() -> None:
    """Main evaluation pipeline.

    Steps:
    1. Load generated routes and real routes
    2. Parse tokens and check validity
    3. Compute novelty (Jaccard distance from nearest real route)
    4. Compute geometric features
    5. Optionally use critic model for grade consistency
    6. Rank routes by composite score
    7. Save evaluation results
    """
    args = parse_args()
    args.out_dir.mkdir(parents=True, exist_ok=True)

    # ─────────────────────────────────────────────────────────────────────
    # Step 1: Load data
    # ─────────────────────────────────────────────────────────────────────
    generated_path = args.generated_dir / "generated_routes.csv"
    routes_path = args.tokenized_dir / "route_sequences.csv"
    token_meta_path = args.tokenized_dir / "token_metadata.csv"

    if not generated_path.exists():
        raise FileNotFoundError("Missing generated routes. Run scripts/03_train_route_generator.py first.")
    if not routes_path.exists() or not token_meta_path.exists():
        raise FileNotFoundError("Missing tokenized artifacts. Run scripts/01_tokenize_routes.py first.")

    df_generated = pd.read_csv(generated_path)
    df_real = pd.read_csv(routes_path)
    df_token_meta = pd.read_csv(token_meta_path)

    # ─────────────────────────────────────────────────────────────────────
    # Step 2: Parse tokens and check validity
    # ─────────────────────────────────────────────────────────────────────
    # Validity checks ensure generated routes are structurally sound:
    # - basic_valid: ≥3 holds, no duplicates, has start+finish, one board
    # - strict_valid: basic_valid + has middle + ≥4 holds
    df_generated["tokens_parsed"] = df_generated["tokens"].apply(parse_token_list)
    df_generated["hold_records"] = df_generated["tokens_parsed"].apply(tokens_to_hold_records)
    df_generated["hold_set"] = df_generated["hold_records"].apply(
        lambda records: frozenset(int(record["placement_id"]) for record in records)
    )

    validity = pd.DataFrame(
        df_generated.apply(
            lambda row: validity_from_records(
                row["hold_records"],
                requested_board_prefix=row.get("requested_board_prefix"),
            ),
            axis=1,
        ).tolist()
    )
    df_eval = pd.concat([df_generated.reset_index(drop=True), validity], axis=1)

    print(f"Evaluated generated routes: {len(df_eval):,}")
    print("\nBasic validity by board:")
    print(df_eval.groupby("board_key")["basic_valid_eval"].mean())
    print("\nStrict validity by board:")
    print(df_eval.groupby("board_key")["strict_valid_eval"].mean())

    # ─────────────────────────────────────────────────────────────────────
    # Step 3: Novelty (Jaccard distance from nearest real route)
    # ─────────────────────────────────────────────────────────────────────
    # For each generated route, find the most similar real route on the
    # same board using Jaccard similarity of hold sets.
    # Novelty distance = 1 - Jaccard similarity
    # A value of 1.0 means completely novel (no shared holds)
    # A value of 0.0 means identical to an existing route
    df_real["real_holds"] = df_real["frames"].apply(frames_to_holds)
    df_real["hold_set"] = df_real["real_holds"].apply(holds_to_placement_set)

    nearest = pd.DataFrame(
        df_eval.apply(
            lambda row: nearest_real_route_same_board(
                generated_set=row["hold_set"],
                generated_board_key=row["board_key"],
                real_df=df_real,
            ),
            axis=1,
        ).tolist()
    )
    df_eval = pd.concat([df_eval, nearest], axis=1)

    print("\nNovelty statistics:")
    print(df_eval[["board_key", "nearest_real_jaccard", "novelty_distance"]].describe())

    # ─────────────────────────────────────────────────────────────────────
    # Step 4: Geometric features
    # ─────────────────────────────────────────────────────────────────────
    # Compute simple spatial features for each generated route:
    # - Number of holds
    # - Height gained (max Y - min Y)
    # - Width span (max X - min X)
    # - Mean hand reach distance
    coords = build_placement_coords(df_token_meta)
    geom = pd.DataFrame(
        df_eval.apply(
            lambda row: simple_route_features(
                board_key=row["board_key"],
                records=row["hold_records"],
                placement_coords=coords,
            ),
            axis=1,
        ).tolist()
    )
    df_eval = pd.concat([df_eval, geom], axis=1)

    print("\nGeometric feature statistics:")
    print(df_eval[["board_key", "geom_n_holds", "geom_height", "geom_width", "geom_mean_hand_reach"]].describe())

    # ─────────────────────────────────────────────────────────────────────
    # Step 5: Grade consistency (using critic model)
    # ─────────────────────────────────────────────────────────────────────
    # If a trained grade predictor is available, use it as a "critic"
    # to check whether generated routes have grades consistent with
    # what was requested.
    device = torch.device(args.device or ("cuda" if torch.cuda.is_available() else "cpu"))
    critic = load_grade_critic(args.grade_model_path, device)
    if critic is not None:
        print("\nUsing grade critic for consistency scoring...")
        df_eval["critic_pred_display_difficulty"] = df_eval["tokens_parsed"].apply(
            lambda tokens: predict_generated_grade(tokens, critic, device)
        )
        df_eval["critic_pred_grouped_v"] = df_eval["critic_pred_display_difficulty"].apply(to_grouped_v)
        df_eval["critic_v_error"] = df_eval["critic_pred_grouped_v"] - df_eval["requested_grouped_v"]

        print("\nCritic grade consistency by board:")
        summary = df_eval.groupby("board_key")["critic_v_error"].agg(
            exact=lambda s: float((s == 0).mean() * 100),
            within_1=lambda s: float((s.abs() <= 1).mean() * 100),
            within_2=lambda s: float((s.abs() <= 2).mean() * 100),
        )
        print(summary)
    else:
        print("No trained grade critic found. Skipping critic-based scoring.")

    # ─────────────────────────────────────────────────────────────────────
    # Step 6: Rank routes by composite score
    # ─────────────────────────────────────────────────────────────────────
    # The composite score rewards:
    # - Basic validity (weight 2.0)
    # - Strict validity (weight 1.0)
    # - Novelty (weight 1.0)
    # - Grade consistency (weight 1.0 for ±1 V-grade, penalty for larger errors)
    ranked = df_eval.copy()
    ranked["score"] = 0.0
    ranked["score"] += ranked["basic_valid_eval"].astype(float) * 2.0
    ranked["score"] += ranked["strict_valid_eval"].astype(float) * 1.0
    ranked["score"] += ranked["novelty_distance"].fillna(0.0)

    if "critic_v_error" in ranked.columns:
        ranked["score"] += (ranked["critic_v_error"].abs() <= 1).astype(float)
        ranked["score"] -= 0.25 * ranked["critic_v_error"].abs()

    top_candidates = ranked.sort_values("score", ascending=False).head(100).reset_index(drop=True)

    print(f"\nTop 10 generated routes by composite score:")
    display_cols = ["board_key", "score", "basic_valid_eval", "strict_valid_eval", "novelty_distance"]
    if "critic_v_error" in top_candidates.columns:
        display_cols.append("critic_v_error")
    print(top_candidates[display_cols].head(10))

    # ─────────────────────────────────────────────────────────────────────
    # Step 7: Save results
    # ─────────────────────────────────────────────────────────────────────
    df_eval.to_csv(args.out_dir / "generated_route_evaluation.csv", index=False)
    top_candidates.to_csv(args.out_dir / "top_generated_candidates.csv", index=False)

    print(f"\nSaved evaluation results to:")
    print(f"  {args.out_dir / 'generated_route_evaluation.csv'}")
    print(f"  {args.out_dir / 'top_generated_candidates.csv'}")


if __name__ == "__main__":
    main()