# ClimbingBoardGPT/src/climbingboardgpt/evaluation.py

from __future__ import annotations

import ast
import re
from typing import Iterable

import numpy as np
import pandas as pd
from scipy.spatial.distance import pdist

# Full-string pattern for one hold token of the form ``<PREFIX_p123_role>``.
# Group 1 is the board token prefix (uppercase letters, digits, underscores),
# group 2 the numeric placement id, and group 3 the role, which must be one of
# start / middle / finish / foot / unknown.
HOLD_TOKEN_PATTERN = re.compile(r"^<([A-Z0-9_]+)_p(\d+)_(start|middle|finish|foot|unknown)>$")


def parse_token_list(value) -> list[str]:
    """Coerce a stored token sequence into a list of tokens.

    Args:
        value: Either an actual list, a string holding a Python list literal
            (e.g. ``"['<A_p1_start>', '<A_p2_finish>']"``), or a
            whitespace-separated token string.

    Returns:
        * ``value`` itself (not a copy) when it is already a list;
        * the evaluated list when ``value`` is a string containing a Python
          list literal;
        * ``value.split()`` for any other string;
        * ``[]`` for every non-list, non-string input (``None``, NaN, ...).
    """
    if isinstance(value, list):
        return value
    if not isinstance(value, str):
        return []
    try:
        parsed = ast.literal_eval(value)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # These are the errors literal_eval is documented to raise on input
        # that is not a valid literal; plain "tok tok tok" strings land here.
        # Anything else indicates a real bug and is allowed to propagate.
        return value.split()
    if isinstance(parsed, list):
        return parsed
    # A valid literal that is not a list (number, dict, tuple, ...) is treated
    # like free text, exactly as an unparsable string would be.
    return value.split()


def tokens_to_hold_records(tokens: Iterable[str]) -> list[dict[str, object]]:
    """Turn hold tokens into structured records, dropping everything else.

    Each token that fully matches ``HOLD_TOKEN_PATTERN`` yields one dict with
    the keys ``token`` (the original string), ``board_token_prefix``,
    ``placement_id`` (as ``int``) and ``role``. Tokens that do not match are
    skipped silently; input order is preserved.
    """
    records: list[dict[str, object]] = []
    # Pair every token with its (possibly missing) match lazily.
    paired = ((candidate, HOLD_TOKEN_PATTERN.match(candidate)) for candidate in tokens)
    for candidate, hit in paired:
        if hit is not None:
            prefix, placement, role = hit.groups()
            records.append(
                {
                    "token": candidate,
                    "board_token_prefix": prefix,
                    "placement_id": int(placement),
                    "role": role,
                }
            )
    return records


def validity_from_records(records: list[dict[str, object]], requested_board_prefix: str | None = None) -> dict[str, object]:
    """Summarise structural validity of a route given its hold records.

    Args:
        records: Hold records carrying ``placement_id``, ``role`` and
            ``board_token_prefix`` keys.
        requested_board_prefix: When given, every record's prefix is compared
            against it; ``None`` disables the comparison (reported as True).

    Returns:
        A dict of ``*_eval`` counts and flags. ``basic_valid_eval`` requires a
        single board prefix, at least 3 holds, no repeated placement id, and
        at least one start and one finish hold. ``strict_valid_eval``
        additionally requires a middle hold and at least 4 holds. The
        requested-board flag is reported but does not feed either verdict.
    """
    placement_ids = [int(rec["placement_id"]) for rec in records]
    role_names = [str(rec["role"]) for rec in records]
    board_prefixes = [str(rec["board_token_prefix"]) for rec in records]

    total = len(records)
    distinct = len(set(placement_ids))
    single_board = len(set(board_prefixes)) <= 1

    if requested_board_prefix is None:
        on_requested_board = True
    else:
        on_requested_board = all(prefix == requested_board_prefix for prefix in board_prefixes)

    tally = {name: role_names.count(name) for name in ("start", "middle", "foot", "finish")}
    has_start = tally["start"] > 0
    has_middle = tally["middle"] > 0
    has_finish = tally["finish"] > 0

    basic_ok = single_board and total >= 3 and total == distinct and has_start and has_finish
    strict_ok = basic_ok and has_middle and total >= 4

    return {
        "n_holds_eval": total,
        "n_unique_placements_eval": distinct,
        "has_duplicate_placements_eval": total != distinct,
        "one_board_only_eval": single_board,
        "matches_requested_board_eval": on_requested_board,
        "n_start_eval": tally["start"],
        "n_middle_eval": tally["middle"],
        "n_foot_eval": tally["foot"],
        "n_finish_eval": tally["finish"],
        "has_start_eval": has_start,
        "has_middle_eval": has_middle,
        "has_finish_eval": has_finish,
        "basic_valid_eval": basic_ok,
        "strict_valid_eval": strict_ok,
    }


def frames_to_holds(frames: str | None) -> list[tuple[int, int]]:
    """Extract ``(placement_id, role_id)`` integer pairs from a frames string.

    Every ``p<digits>r<digits>`` occurrence in ``frames`` contributes one
    pair, in order of appearance. Any non-string input yields an empty list.
    """
    if isinstance(frames, str):
        return [
            (int(hit.group(1)), int(hit.group(2)))
            for hit in re.finditer(r"p(\d+)r(\d+)", frames)
        ]
    return []


def holds_to_placement_set(holds: Iterable[tuple[int, int]]) -> frozenset[int]:
    """Return the distinct placement ids of ``holds`` as a frozenset.

    Each item must unpack into exactly two values; only the first (the
    placement id, coerced with ``int``) is kept.
    """
    placement_ids = set()
    for placement_id, _role_id in holds:
        placement_ids.add(int(placement_id))
    return frozenset(placement_ids)


def jaccard(a: frozenset[int], b: frozenset[int]) -> float:
    """Jaccard similarity ``|a & b| / |a | b|`` of two placement sets.

    Two empty sets are defined as identical (1.0); an empty set compared
    with a non-empty one scores 0.0.
    """
    if a and b:
        return len(a & b) / len(a | b)
    # At least one side is empty: identical only if both are.
    return 0.0 if (a or b) else 1.0


def nearest_real_route_same_board(
    generated_set: frozenset[int],
    generated_board_key: str,
    real_df: pd.DataFrame,
) -> dict[str, object]:
    """Find the real route on the same board most similar to a generated one.

    Rows of ``real_df`` whose ``board_key`` equals ``generated_board_key``
    are scored with ``jaccard(generated_set, row["hold_set"])``; the first
    row reaching the highest score wins.

    Returns:
        A dict with ``nearest_real_jaccard``, ``nearest_real_uuid``,
        ``nearest_real_name``, ``nearest_real_grouped_v``,
        ``nearest_real_angle`` and ``novelty_distance`` (one minus the best
        similarity). When no row belongs to the board, the similarity stays
        at the ``-1.0`` sentinel, the descriptive fields are ``None`` and
        ``novelty_distance`` is therefore ``2.0``.
    """
    candidates = real_df[real_df["board_key"] == generated_board_key]

    top_score = -1.0
    top_row = None
    for _, candidate in candidates.iterrows():
        score = jaccard(generated_set, candidate["hold_set"])
        # Strict comparison: ties keep the earliest candidate.
        if score > top_score:
            top_score = score
            top_row = candidate

    if top_row is None:
        uuid = name = grouped_v = angle = None
    else:
        uuid = top_row["uuid"]
        name = top_row["climb_name"]
        grouped_v = top_row["grouped_v"]
        angle = top_row["angle"]

    return {
        "nearest_real_jaccard": top_score,
        "nearest_real_uuid": uuid,
        "nearest_real_name": name,
        "nearest_real_grouped_v": grouped_v,
        "nearest_real_angle": angle,
        "novelty_distance": 1.0 - float(top_score),
    }


def build_placement_coords(df_token_meta: pd.DataFrame) -> dict[tuple[str, int], dict[str, float]]:
    """Build a ``(board_key, placement_id) -> {"x", "y"}`` lookup table.

    Only rows with ``kind == "hold"`` and a non-null ``placement_id`` are
    used; for repeated ``(board_key, placement_id)`` pairs the first row
    wins. Keys are coerced to ``(str, int)`` and coordinates to ``float``
    (NaN coordinates are kept as NaN).
    """
    is_hold = df_token_meta["kind"] == "hold"
    unique_holds = (
        df_token_meta[is_hold]
        .dropna(subset=["placement_id"])
        .drop_duplicates(["board_key", "placement_id"])
    )
    return {
        (str(entry["board_key"]), int(entry["placement_id"])): {
            "x": float(entry["x"]),
            "y": float(entry["y"]),
        }
        for _, entry in unique_holds.iterrows()
    }


def simple_route_features(
    board_key: str,
    records: list[dict[str, object]],
    placement_coords: dict[tuple[str, int], dict[str, float]],
) -> dict[str, float]:
    """Compute coarse geometric descriptors of a route.

    Args:
        board_key: Board identifier; combined with each record's
            ``placement_id`` to look coordinates up in ``placement_coords``.
        records: Hold records with ``placement_id`` and ``role`` keys.
        placement_coords: ``(board_key, placement_id) -> {"x", "y"}`` table.

    Returns:
        ``geom_n_holds`` (holds with usable coordinates), ``geom_height`` and
        ``geom_width`` (y / x extent), ``geom_mean_y``, ``geom_mean_x_abs``,
        and the mean / max Euclidean distance over all pairs of hand holds
        (roles start, middle, finish). Holds without coordinates or with NaN
        coordinates are ignored. All geometric values are NaN when no hold is
        usable; the reach values are NaN with fewer than two hand holds.
    """
    hand_roles = {"start", "middle", "finish"}
    nan_when_empty = (
        "geom_height",
        "geom_width",
        "geom_mean_y",
        "geom_mean_x_abs",
        "geom_mean_hand_reach",
        "geom_max_hand_reach",
    )
    board = str(board_key)

    located = []
    for record in records:
        coord = placement_coords.get((board, int(record["placement_id"])))
        if coord is None:
            continue
        x, y = float(coord["x"]), float(coord["y"])
        if np.isnan(x) or np.isnan(y):
            continue
        located.append({"x": x, "y": y, "is_hand": str(record["role"]) in hand_roles})

    if not located:
        return {"geom_n_holds": 0.0, **dict.fromkeys(nan_when_empty, np.nan)}

    frame = pd.DataFrame(located)
    xs, ys = frame["x"], frame["y"]

    mean_reach = max_reach = np.nan
    # pdist covers every pair of hand holds; the sort only fixes the order in
    # which those pairwise distances are produced.
    hand_points = frame[frame["is_hand"]].sort_values(["y", "x"])[["x", "y"]].values
    if len(hand_points) >= 2:
        pairwise = pdist(hand_points)
        mean_reach = float(pairwise.mean())
        max_reach = float(pairwise.max())

    return {
        "geom_n_holds": float(len(frame)),
        "geom_height": float(ys.max() - ys.min()),
        "geom_width": float(xs.max() - xs.min()),
        "geom_mean_y": float(ys.mean()),
        "geom_mean_x_abs": float(xs.abs().mean()),
        "geom_mean_hand_reach": mean_reach,
        "geom_max_hand_reach": max_reach,
    }