Next version. Models + scripts updated. 2

This commit is contained in:
Pawel
2026-05-21 22:21:26 -04:00
parent 0002ef1545
commit 86d582a572
23 changed files with 1768 additions and 293 deletions

View File

@@ -60,7 +60,7 @@ from climbingboardgpt.tokenization import (
make_placement_lookup,
vocab_payload,
)
from climbingboardgpt.utils import json_safe, safe_train_test_split, set_seed, write_json
from climbingboardgpt.utils import assign_group_splits, json_safe, set_seed, write_json
def parse_args() -> argparse.Namespace:
@@ -101,8 +101,8 @@ Examples:
parser.add_argument(
"--seed",
type=int,
default=42,
help="Random seed for reproducible splits (default: 42)",
default=3,
help="Random seed for reproducible splits (default: 3)",
)
return parser.parse_args()
@@ -244,41 +244,33 @@ def main() -> None:
df_routes["ids_with_grade"] = df_routes["tokens_with_grade"].apply(lambda tokens: encode(tokens, stoi))
df_routes["ids_no_grade"] = df_routes["tokens_no_grade"].apply(lambda tokens: encode(tokens, stoi))
# ─────────────────────────────────────────────────────────────────────
# Step 6: Train/val/test split (stratified)
# Step 6: Train/val/test split (grouped by logical climb)
# ─────────────────────────────────────────────────────────────────────
# We split 80/10/10, stratified by board_key × grouped_v.
# This ensures both boards and all difficulty levels are represented
# in each split, which is critical for fair evaluation.
# A single climb UUID can appear at multiple wall angles. We therefore
# split by (board_key, uuid), not by individual rows. This avoids putting
# one angle of a climb in train and another angle of the same climb in test.
#
# Stratification prevents scenarios like "all V14 climbs end up in
# the test set while training has none."
# The split is stratified by board_key × grouped_v at the group level when
# possible. The row proportions may differ slightly from 80/10/10 because
# some climbs have more angle entries than others, but this is preferable
# to route-level leakage or to the earlier UUID-to-split map, in which a
# later assignment silently overwrote an earlier one for a repeated UUID.
df_routes["split_stratum"] = (
df_routes["board_key"].astype(str)
+ "__V"
+ df_routes["grouped_v"].astype(str)
)
train_df, temp_df = safe_train_test_split(
df_routes["split"] = assign_group_splits(
df_routes,
group_cols=["board_key", "uuid"],
test_size=0.20,
random_state=args.seed,
stratify_col="split_stratum",
)
val_df, test_df = safe_train_test_split(
temp_df,
test_size=0.50,
val_size_within_temp=0.50,
random_state=args.seed,
stratify_col="split_stratum",
)
split_map = {}
split_map.update({uuid: "train" for uuid in train_df["uuid"]})
split_map.update({uuid: "val" for uuid in val_df["uuid"]})
split_map.update({uuid: "test" for uuid in test_df["uuid"]})
df_routes["split"] = df_routes["uuid"].map(split_map)
print(f"\nSplit counts:")
print("\nSplit counts:")
print(df_routes.groupby(["board_key", "split"]).size().unstack(fill_value=0))
# ─────────────────────────────────────────────────────────────────────
@@ -357,4 +349,4 @@ def main() -> None:
if __name__ == "__main__":
main()
main()