fixed leakage

This commit is contained in:
Pawel Sarkowicz
2026-03-28 16:03:04 -04:00
parent 880272aaf5
commit 3ab9b77bb7
36 changed files with 2296 additions and 681 deletions

View File

@@ -19,8 +19,11 @@
"2. **Route structure** \n",
" Examples: number of holds, spatial spread, height gained, move lengths, left/right balance, and other frame-derived quantities.\n",
"\n",
"3. **Hold difficulty priors** \n",
" Examples: average, maximum, and distributional summaries of the empirical hold scores built in notebook 03.\n",
    "In the first version of this analysis, we also included:\n",
"\n",
"3. **Hold difficulty priors** \n",
"\n",
    "However, that makes the approach circular: we would be using the difficulty data to build hold difficulty scores, and then using those scores to predict the same difficulty data. The target is already baked into those features, so the result is not a truly independent model. In practice, I don't think this is a big deal if we **just** want to predict V-grades, but we leave these features out of our analysis in order to see which features actually help determine the difficulty of a climb.\n",
"\n",
"## Output\n",
"\n",
@@ -111,8 +114,8 @@
"Query our data from the DB\n",
"==================================\n",
"\n",
"We restrict to `layout_id=1` for the Kilter Board Original\n",
"\n",
"We restrict to `layout_id=1` for the Kilter Board Original.\n",
"Again, we set the date to be past 2016 for simplicity (dates start in 2018, with the exception of one in 2006).\n",
"\"\"\"\n",
"\n",
"# Query climbs data\n",
@@ -261,7 +264,7 @@
"\n",
"\n",
"# Test\n",
"test_frames = \"p1r5p2r6p3r8p4r5\"\n",
"test_frames = \"p1r12p2r13p3r14p4r15\"\n",
"parsed = parse_frames(test_frames)\n",
"print(f\"Test parse: {parsed}\")"
]
@@ -283,564 +286,212 @@
"metadata": {},
"outputs": [],
"source": [
"\"\"\"\n",
"==================================\n",
    "Feature Extraction Function\n",
"==================================\n",
"\"\"\"\n",
"\n",
"def extract_features(row, placement_coords, df_hold_difficulty):\n",
"def extract_features(row, placement_coords):\n",
" \"\"\"\n",
" Extract all features from a single climb row.\n",
" Extract a trimmed set of clean geometric/spatial features.\n",
" No hold-difficulty-derived features are used.\n",
" \"\"\"\n",
" features = {}\n",
" \n",
" # Parse frames\n",
"\n",
" holds = parse_frames(row['frames'])\n",
" angle = row['angle']\n",
" \n",
"\n",
" if not holds:\n",
" return None\n",
" \n",
" # =====================\n",
" # BASIC HOLD EXTRACTION\n",
" # =====================\n",
" \n",
"\n",
" hold_data = []\n",
" for placement_id, role_id in holds:\n",
" coords = placement_coords.get(placement_id, (None, None))\n",
" if coords[0] is None:\n",
" continue\n",
" \n",
"\n",
" role_type = get_role_type(role_id)\n",
" is_hand = role_id in HAND_ROLE_IDS\n",
" is_foot = role_id in FOOT_ROLE_IDS\n",
" \n",
" # Get difficulty scores for this hold at this angle\n",
" diff_key = f\"{role_type}_diff_{int(angle)}deg\"\n",
" hand_diff_key = f\"hand_diff_{int(angle)}deg\"\n",
" foot_diff_key = f\"foot_diff_{int(angle)}deg\"\n",
" \n",
" difficulty = None\n",
" if placement_id in df_hold_difficulty.index:\n",
" # Try role-specific first, then aggregate\n",
" if diff_key in df_hold_difficulty.columns:\n",
" difficulty = df_hold_difficulty.loc[placement_id, diff_key]\n",
" if pd.isna(difficulty):\n",
" if is_hand and hand_diff_key in df_hold_difficulty.columns:\n",
" difficulty = df_hold_difficulty.loc[placement_id, hand_diff_key]\n",
" elif is_foot and foot_diff_key in df_hold_difficulty.columns:\n",
" difficulty = df_hold_difficulty.loc[placement_id, foot_diff_key]\n",
" \n",
" # Fallback to overall\n",
" if pd.isna(difficulty) and 'overall_difficulty' in df_hold_difficulty.columns:\n",
" difficulty = df_hold_difficulty.loc[placement_id, 'overall_difficulty']\n",
" \n",
"\n",
" hold_data.append({\n",
" 'placement_id': placement_id,\n",
" 'x': coords[0],\n",
" 'y': coords[1],\n",
" 'role_id': role_id,\n",
" 'role_type': role_type,\n",
" 'is_hand': is_hand,\n",
" 'is_foot': is_foot,\n",
" 'difficulty': difficulty\n",
" })\n",
" \n",
"\n",
" if not hold_data:\n",
" return None\n",
" \n",
"\n",
" df_holds = pd.DataFrame(hold_data)\n",
" \n",
" # Separate by role\n",
"\n",
" hand_holds = df_holds[df_holds['is_hand']]\n",
" foot_holds = df_holds[df_holds['is_foot']]\n",
" start_holds = df_holds[df_holds['role_type'] == 'start']\n",
" finish_holds = df_holds[df_holds['role_type'] == 'finish']\n",
" middle_holds = df_holds[df_holds['role_type'] == 'middle']\n",
" \n",
" # =====================\n",
" # 1. ANGLE\n",
" # =====================\n",
"\n",
" xs = df_holds['x'].to_numpy()\n",
" ys = df_holds['y'].to_numpy()\n",
"\n",
" description = row.get('description', '')\n",
" if pd.isna(description):\n",
" description = ''\n",
"\n",
" center_x = (x_min + x_max) / 2\n",
"\n",
" # Basic\n",
" features['angle'] = angle\n",
" \n",
" # =====================\n",
" # 2. BASIC COUNTS\n",
" # =====================\n",
" features['angle_squared'] = angle ** 2\n",
"\n",
" features['total_holds'] = len(df_holds)\n",
" features['hand_holds'] = len(hand_holds)\n",
" features['foot_holds'] = len(foot_holds)\n",
" features['start_holds'] = len(start_holds)\n",
" features['finish_holds'] = len(finish_holds)\n",
" features['middle_holds'] = len(middle_holds)\n",
" \n",
" # =====================\n",
" # 3. MATCHING FEATURE\n",
" # =====================\n",
" # A climb is \"matching\" if you are allowed to match your hands at any hold.\n",
    " # There are slight differences in difficulty between matching and no-matching climbs, as per our analysis in 01.\n",
" features['is_nomatch'] = int((row['is_nomatch'] == 1) or bool(re.search(r'\\bno\\s*match(ing)?\\b', row['description'], flags=re.IGNORECASE)))\n",
" \n",
" # =====================\n",
" # 4. SPATIAL/POSITION\n",
" # =====================\n",
" xs = df_holds['x'].values\n",
" ys = df_holds['y'].values\n",
" \n",
" features['mean_x'] = np.mean(xs)\n",
"\n",
" features['is_nomatch'] = int(\n",
" (row['is_nomatch'] == 1) or\n",
" bool(re.search(r'\\bno\\s*match(ing)?\\b', description, flags=re.IGNORECASE))\n",
" )\n",
"\n",
" # Spatial\n",
" features['mean_y'] = np.mean(ys)\n",
" features['std_x'] = np.std(xs) if len(xs) > 1 else 0\n",
" features['std_y'] = np.std(ys) if len(ys) > 1 else 0\n",
" features['std_x'] = np.std(xs) if len(xs) > 1 else 0.0\n",
" features['std_y'] = np.std(ys) if len(ys) > 1 else 0.0\n",
" features['range_x'] = np.max(xs) - np.min(xs)\n",
" features['range_y'] = np.max(ys) - np.min(ys)\n",
" features['min_y'] = np.min(ys)\n",
" features['max_y'] = np.max(ys)\n",
" \n",
" # =====================\n",
" # 5. HEIGHT FEATURES\n",
" # =====================\n",
" if len(start_holds) > 0:\n",
" features['start_height'] = start_holds['y'].mean()\n",
" features['start_height_min'] = start_holds['y'].min()\n",
" features['start_height_max'] = start_holds['y'].max()\n",
" else:\n",
" features['start_height'] = np.nan\n",
" features['start_height_min'] = np.nan\n",
" features['start_height_max'] = np.nan\n",
" \n",
" if len(finish_holds) > 0:\n",
" features['finish_height'] = finish_holds['y'].mean()\n",
" features['finish_height_min'] = finish_holds['y'].min()\n",
" features['finish_height_max'] = finish_holds['y'].max()\n",
" else:\n",
" features['finish_height'] = np.nan\n",
" features['finish_height_min'] = np.nan\n",
" features['finish_height_max'] = np.nan\n",
" \n",
" features['height_gained'] = features['max_y'] - features['min_y']\n",
" \n",
" if pd.notna(features.get('finish_height')) and pd.notna(features.get('start_height')):\n",
" features['height_gained_start_finish'] = features['finish_height'] - features['start_height']\n",
" else:\n",
" features['height_gained_start_finish'] = np.nan\n",
" \n",
" # =====================\n",
" # 6. BBOX FEATURES\n",
" # =====================\n",
" bbox_width = features['range_x']\n",
" bbox_height = features['range_y']\n",
" features['bbox_area'] = bbox_width * bbox_height\n",
" features['bbox_aspect_ratio'] = bbox_width / bbox_height if bbox_height > 0 else 0\n",
" features['bbox_normalized_area'] = features['bbox_area'] / (board_width * board_height)\n",
" \n",
" # =====================\n",
" # 7. HOLD DENSITY\n",
" # =====================\n",
" if features['bbox_area'] > 0:\n",
" features['hold_density'] = features['total_holds'] / features['bbox_area']\n",
" else:\n",
" features['hold_density'] = 0\n",
" \n",
"\n",
" # Start / finish heights\n",
" start_height = start_holds['y'].mean() if len(start_holds) > 0 else np.nan\n",
" finish_height = finish_holds['y'].mean() if len(finish_holds) > 0 else np.nan\n",
"\n",
" features['height_gained_start_finish'] = (\n",
" finish_height - start_height\n",
" if pd.notna(start_height) and pd.notna(finish_height)\n",
" else np.nan\n",
" )\n",
"\n",
" # Density / symmetry\n",
" bbox_area = features['range_x'] * features['range_y']\n",
" features['bbox_area'] = bbox_area\n",
" features['hold_density'] = features['total_holds'] / bbox_area if bbox_area > 0 else 0.0\n",
" features['holds_per_vertical_foot'] = features['total_holds'] / max(features['range_y'], 1)\n",
" \n",
" # =====================\n",
" # 8. SYMMETRY/BALANCE\n",
" # =====================\n",
" center_x = (x_min + x_max) / 2\n",
" features['left_holds'] = (df_holds['x'] < center_x).sum()\n",
" features['right_holds'] = (df_holds['x'] >= center_x).sum()\n",
" features['left_ratio'] = features['left_holds'] / features['total_holds'] if features['total_holds'] > 0 else 0.5\n",
" \n",
" # Symmetry score (how balanced left/right)\n",
"\n",
" left_holds = (df_holds['x'] < center_x).sum()\n",
" features['left_ratio'] = left_holds / features['total_holds'] if features['total_holds'] > 0 else 0.5\n",
" features['symmetry_score'] = 1 - abs(features['left_ratio'] - 0.5) * 2\n",
" \n",
" # Hand symmetry\n",
" if len(hand_holds) > 0:\n",
" hand_left = (hand_holds['x'] < center_x).sum()\n",
" hand_right = (hand_holds['x'] >= center_x).sum()\n",
" features['hand_left_ratio'] = hand_left / len(hand_holds)\n",
" features['hand_symmetry'] = 1 - abs(features['hand_left_ratio'] - 0.5) * 2\n",
" else:\n",
" features['hand_left_ratio'] = np.nan\n",
" features['hand_symmetry'] = np.nan\n",
" \n",
" # =====================\n",
" # 9. VERTICAL DISTRIBUTION\n",
" # =====================\n",
"\n",
" y_median = np.median(ys)\n",
" features['upper_holds'] = (df_holds['y'] > y_median).sum()\n",
" features['lower_holds'] = (df_holds['y'] <= y_median).sum()\n",
" features['upper_ratio'] = features['upper_holds'] / features['total_holds']\n",
" \n",
" # =====================\n",
" # 10. HAND REACH / SPREAD\n",
" # =====================\n",
" upper_holds = (df_holds['y'] > y_median).sum()\n",
" features['upper_ratio'] = upper_holds / features['total_holds']\n",
"\n",
" # Hand reach\n",
" if len(hand_holds) >= 2:\n",
" hand_xs = hand_holds['x'].values\n",
" hand_ys = hand_holds['y'].values\n",
" \n",
" hand_distances = []\n",
" for i in range(len(hand_holds)):\n",
" for j in range(i + 1, len(hand_holds)):\n",
" dx = hand_xs[i] - hand_xs[j]\n",
" dy = hand_ys[i] - hand_ys[j]\n",
" hand_distances.append(np.sqrt(dx**2 + dy**2))\n",
" \n",
" features['max_hand_reach'] = max(hand_distances)\n",
" features['min_hand_reach'] = min(hand_distances)\n",
" features['mean_hand_reach'] = np.mean(hand_distances)\n",
" features['std_hand_reach'] = np.std(hand_distances)\n",
" features['hand_spread_x'] = hand_xs.max() - hand_xs.min()\n",
" features['hand_spread_y'] = hand_ys.max() - hand_ys.min()\n",
" hand_points = hand_holds[['x', 'y']].to_numpy()\n",
" hand_distances = pdist(hand_points)\n",
"\n",
" hand_xs = hand_holds['x'].to_numpy()\n",
" hand_ys = hand_holds['y'].to_numpy()\n",
"\n",
" features['mean_hand_reach'] = float(np.mean(hand_distances))\n",
" features['max_hand_reach'] = float(np.max(hand_distances))\n",
" features['std_hand_reach'] = float(np.std(hand_distances))\n",
" features['hand_spread_x'] = float(hand_xs.max() - hand_xs.min())\n",
" features['hand_spread_y'] = float(hand_ys.max() - hand_ys.min())\n",
" else:\n",
" features['max_hand_reach'] = 0\n",
" features['min_hand_reach'] = 0\n",
" features['mean_hand_reach'] = 0\n",
" features['std_hand_reach'] = 0\n",
" features['hand_spread_x'] = 0\n",
" features['hand_spread_y'] = 0\n",
" \n",
" # =====================\n",
" # 11. FOOT SPREAD\n",
" # =====================\n",
" if len(foot_holds) >= 2:\n",
" foot_xs = foot_holds['x'].values\n",
" foot_ys = foot_holds['y'].values\n",
" \n",
" foot_distances = []\n",
" for i in range(len(foot_holds)):\n",
" for j in range(i + 1, len(foot_holds)):\n",
" dx = foot_xs[i] - foot_xs[j]\n",
" dy = foot_ys[i] - foot_ys[j]\n",
" foot_distances.append(np.sqrt(dx**2 + dy**2))\n",
" \n",
" features['max_foot_spread'] = max(foot_distances)\n",
" features['mean_foot_spread'] = np.mean(foot_distances)\n",
" features['foot_spread_x'] = foot_xs.max() - foot_xs.min()\n",
" features['foot_spread_y'] = foot_ys.max() - foot_ys.min()\n",
" else:\n",
" features['max_foot_spread'] = 0\n",
" features['mean_foot_spread'] = 0\n",
" features['foot_spread_x'] = 0\n",
" features['foot_spread_y'] = 0\n",
" \n",
" # =====================\n",
" # 12. HAND-TO-FOOT DISTANCES\n",
" # =====================\n",
" features['mean_hand_reach'] = 0.0\n",
" features['max_hand_reach'] = 0.0\n",
" features['std_hand_reach'] = 0.0\n",
" features['hand_spread_x'] = 0.0\n",
" features['hand_spread_y'] = 0.0\n",
"\n",
" # Hand-foot distances\n",
" if len(hand_holds) > 0 and len(foot_holds) > 0:\n",
" h2f_distances = []\n",
" for _, h in hand_holds.iterrows():\n",
" for _, f in foot_holds.iterrows():\n",
" dx = h['x'] - f['x']\n",
" dy = h['y'] - f['y']\n",
" h2f_distances.append(np.sqrt(dx**2 + dy**2))\n",
" \n",
" features['max_hand_to_foot'] = max(h2f_distances)\n",
" features['min_hand_to_foot'] = min(h2f_distances)\n",
" features['mean_hand_to_foot'] = np.mean(h2f_distances)\n",
" features['std_hand_to_foot'] = np.std(h2f_distances)\n",
" hand_points = hand_holds[['x', 'y']].to_numpy()\n",
" foot_points = foot_holds[['x', 'y']].to_numpy()\n",
"\n",
" dists = []\n",
" for hx, hy in hand_points:\n",
" for fx, fy in foot_points:\n",
" dists.append(np.sqrt((hx - fx)**2 + (hy - fy)**2))\n",
" dists = np.asarray(dists)\n",
"\n",
" features['min_hand_to_foot'] = float(np.min(dists))\n",
" features['mean_hand_to_foot'] = float(np.mean(dists))\n",
" features['std_hand_to_foot'] = float(np.std(dists))\n",
" else:\n",
" features['max_hand_to_foot'] = 0\n",
" features['min_hand_to_foot'] = 0\n",
" features['mean_hand_to_foot'] = 0\n",
" features['std_hand_to_foot'] = 0\n",
" \n",
" # =====================\n",
" # 13. HOLD DIFFICULTY FEATURES\n",
" # =====================\n",
" difficulties = df_holds['difficulty'].dropna().values\n",
" \n",
" if len(difficulties) > 0:\n",
" features['mean_hold_difficulty'] = np.mean(difficulties)\n",
" features['max_hold_difficulty'] = np.max(difficulties)\n",
" features['min_hold_difficulty'] = np.min(difficulties)\n",
" features['std_hold_difficulty'] = np.std(difficulties)\n",
" features['median_hold_difficulty'] = np.median(difficulties)\n",
" features['difficulty_range'] = features['max_hold_difficulty'] - features['min_hold_difficulty']\n",
" else:\n",
" features['mean_hold_difficulty'] = np.nan\n",
" features['max_hold_difficulty'] = np.nan\n",
" features['min_hold_difficulty'] = np.nan\n",
" features['std_hold_difficulty'] = np.nan\n",
" features['median_hold_difficulty'] = np.nan\n",
" features['difficulty_range'] = np.nan\n",
" \n",
" # Hand difficulty\n",
" hand_diffs = hand_holds['difficulty'].dropna().values if len(hand_holds) > 0 else np.array([])\n",
" if len(hand_diffs) > 0:\n",
" features['mean_hand_difficulty'] = np.mean(hand_diffs)\n",
" features['max_hand_difficulty'] = np.max(hand_diffs)\n",
" features['std_hand_difficulty'] = np.std(hand_diffs)\n",
" else:\n",
" features['mean_hand_difficulty'] = np.nan\n",
" features['max_hand_difficulty'] = np.nan\n",
" features['std_hand_difficulty'] = np.nan\n",
" \n",
" # Foot difficulty\n",
" foot_diffs = foot_holds['difficulty'].dropna().values if len(foot_holds) > 0 else np.array([])\n",
" if len(foot_diffs) > 0:\n",
" features['mean_foot_difficulty'] = np.mean(foot_diffs)\n",
" features['max_foot_difficulty'] = np.max(foot_diffs)\n",
" features['std_foot_difficulty'] = np.std(foot_diffs)\n",
" else:\n",
" features['mean_foot_difficulty'] = np.nan\n",
" features['max_foot_difficulty'] = np.nan\n",
" features['std_foot_difficulty'] = np.nan\n",
" \n",
" # Start/Finish difficulty\n",
" start_diffs = start_holds['difficulty'].dropna().values if len(start_holds) > 0 else np.array([])\n",
" finish_diffs = finish_holds['difficulty'].dropna().values if len(finish_holds) > 0 else np.array([])\n",
" \n",
" features['start_difficulty'] = np.mean(start_diffs) if len(start_diffs) > 0 else np.nan\n",
" features['finish_difficulty'] = np.mean(finish_diffs) if len(finish_diffs) > 0 else np.nan\n",
" \n",
" # =====================\n",
" # 14. ADDITIONAL BASIC FEATURES\n",
" # =====================\n",
" \n",
" # Hand to foot ratio\n",
" features['hand_foot_ratio'] = features['hand_holds'] / max(features['foot_holds'], 1)\n",
" \n",
" # Movement complexity\n",
" features['movement_density'] = features['total_holds'] / max(features['height_gained'], 1)\n",
" \n",
" # Center of mass of holds\n",
" features['hold_com_x'] = np.average(xs, weights=None)\n",
" features['hold_com_y'] = np.average(ys, weights=None)\n",
" \n",
" # Weighted difficulty (by y position)\n",
" if len(difficulties) > 0 and len(ys) >= len(difficulties):\n",
" weights = (ys[:len(difficulties)] - ys.min()) / max(ys.max() - ys.min(), 1) + 0.5\n",
" features['weighted_difficulty'] = np.average(difficulties, weights=weights)\n",
" else:\n",
" features['weighted_difficulty'] = features['mean_hold_difficulty']\n",
" \n",
" # =====================================================\n",
" # 15. GEOMETRIC FEATURES\n",
" # =====================================================\n",
" \n",
" # Convex hull area (2D polygon enclosing all holds)\n",
" features['min_hand_to_foot'] = 0.0\n",
" features['mean_hand_to_foot'] = 0.0\n",
" features['std_hand_to_foot'] = 0.0\n",
"\n",
" # Global geometry\n",
" points = np.column_stack([xs, ys])\n",
"\n",
" if len(df_holds) >= 3:\n",
" try:\n",
" points = np.column_stack([xs, ys])\n",
" hull = ConvexHull(points)\n",
" features['convex_hull_area'] = hull.volume # In 2D, volume = area\n",
" features['convex_hull_perimeter'] = hull.area # In 2D, area = perimeter\n",
" features['hull_area_to_bbox_ratio'] = features['convex_hull_area'] / max(features['bbox_area'], 1)\n",
" except:\n",
" features['convex_hull_area'] = float(hull.volume)\n",
" features['hull_area_to_bbox_ratio'] = features['convex_hull_area'] / max(bbox_area, 1)\n",
" except Exception:\n",
" features['convex_hull_area'] = np.nan\n",
" features['convex_hull_perimeter'] = np.nan\n",
" features['hull_area_to_bbox_ratio'] = np.nan\n",
" else:\n",
" features['convex_hull_area'] = 0\n",
" features['convex_hull_perimeter'] = 0\n",
" features['hull_area_to_bbox_ratio'] = 0\n",
" \n",
" # Nearest neighbor distances\n",
" features['convex_hull_area'] = 0.0\n",
" features['hull_area_to_bbox_ratio'] = 0.0\n",
"\n",
" if len(df_holds) >= 2:\n",
" points = np.column_stack([xs, ys])\n",
" distances = pdist(points)\n",
" \n",
" features['min_nn_distance'] = np.min(distances)\n",
" features['mean_nn_distance'] = np.mean(distances)\n",
" features['max_nn_distance'] = np.max(distances)\n",
" features['std_nn_distance'] = np.std(distances)\n",
" pairwise = pdist(points)\n",
" features['mean_pairwise_distance'] = float(np.mean(pairwise))\n",
" features['std_pairwise_distance'] = float(np.std(pairwise))\n",
" else:\n",
" features['min_nn_distance'] = 0\n",
" features['mean_nn_distance'] = 0\n",
" features['max_nn_distance'] = 0\n",
" features['std_nn_distance'] = 0\n",
" \n",
" # Clustering coefficient (holds grouped vs spread)\n",
" if len(df_holds) >= 3:\n",
" points = np.column_stack([xs, ys])\n",
" dist_matrix = squareform(pdist(points))\n",
" \n",
" # Count neighbors within threshold (e.g., 12 inches)\n",
" threshold = 12.0\n",
" neighbors_count = (dist_matrix < threshold).sum(axis=1) - 1 # Exclude self\n",
" features['mean_neighbors_12in'] = np.mean(neighbors_count)\n",
" features['max_neighbors_12in'] = np.max(neighbors_count)\n",
" \n",
" # Clustering: ratio of actual neighbors to max possible\n",
" avg_neighbors = np.mean(neighbors_count)\n",
" max_possible = len(df_holds) - 1\n",
" features['clustering_ratio'] = avg_neighbors / max_possible if max_possible > 0 else 0\n",
" else:\n",
" features['mean_neighbors_12in'] = 0\n",
" features['max_neighbors_12in'] = 0\n",
" features['clustering_ratio'] = 0\n",
" \n",
" # Path length (greedy nearest-neighbor tour)\n",
" features['mean_pairwise_distance'] = 0.0\n",
" features['std_pairwise_distance'] = 0.0\n",
"\n",
" if len(df_holds) >= 2:\n",
" # Sort by y (bottom to top) for approximate path\n",
" sorted_indices = np.argsort(ys)\n",
" sorted_points = np.column_stack([xs[sorted_indices], ys[sorted_indices]])\n",
" \n",
" path_length = 0\n",
" sorted_idx = np.argsort(ys)\n",
" sorted_points = points[sorted_idx]\n",
"\n",
" path_length = 0.0\n",
" for i in range(len(sorted_points) - 1):\n",
" dx = sorted_points[i+1, 0] - sorted_points[i, 0]\n",
" dy = sorted_points[i+1, 1] - sorted_points[i, 1]\n",
" dx = sorted_points[i + 1, 0] - sorted_points[i, 0]\n",
" dy = sorted_points[i + 1, 1] - sorted_points[i, 1]\n",
" path_length += np.sqrt(dx**2 + dy**2)\n",
" \n",
"\n",
" features['path_length_vertical'] = path_length\n",
" features['path_efficiency'] = features['height_gained'] / max(path_length, 1)\n",
" else:\n",
" features['path_length_vertical'] = 0\n",
" features['path_efficiency'] = 0\n",
" \n",
" # =====================================================\n",
" # 16. DIFFICULTY-WEIGHTED FEATURES\n",
" # =====================================================\n",
" \n",
" # Difficulty gradient (finish vs start)\n",
" if pd.notna(features.get('finish_difficulty')) and pd.notna(features.get('start_difficulty')):\n",
" features['difficulty_gradient'] = features['finish_difficulty'] - features['start_difficulty']\n",
" else:\n",
" features['difficulty_gradient'] = np.nan\n",
" \n",
" # Difficulty variance by vertical region (split into thirds)\n",
" if len(difficulties) > 0:\n",
" y_min_val, y_max_val = ys.min(), ys.max()\n",
" y_range = y_max_val - y_min_val\n",
" \n",
" if y_range > 0:\n",
" # Split into lower, middle, upper thirds\n",
" lower_mask = ys <= (y_min_val + y_range / 3)\n",
" middle_mask = (ys > y_min_val + y_range / 3) & (ys <= y_min_val + 2 * y_range / 3)\n",
" upper_mask = ys > (y_min_val + 2 * y_range / 3)\n",
" \n",
" # Get difficulties for each region\n",
" df_with_diff = df_holds.copy()\n",
" df_with_diff['lower'] = lower_mask\n",
" df_with_diff['middle'] = middle_mask\n",
" df_with_diff['upper'] = upper_mask\n",
" \n",
" lower_diffs = df_with_diff[df_with_diff['lower'] & df_with_diff['difficulty'].notna()]['difficulty']\n",
" middle_diffs = df_with_diff[df_with_diff['middle'] & df_with_diff['difficulty'].notna()]['difficulty']\n",
" upper_diffs = df_with_diff[df_with_diff['upper'] & df_with_diff['difficulty'].notna()]['difficulty']\n",
" \n",
" features['lower_region_difficulty'] = lower_diffs.mean() if len(lower_diffs) > 0 else np.nan\n",
" features['middle_region_difficulty'] = middle_diffs.mean() if len(middle_diffs) > 0 else np.nan\n",
" features['upper_region_difficulty'] = upper_diffs.mean() if len(upper_diffs) > 0 else np.nan\n",
" \n",
" # Difficulty progression (upper - lower)\n",
" if pd.notna(features['lower_region_difficulty']) and pd.notna(features['upper_region_difficulty']):\n",
" features['difficulty_progression'] = features['upper_region_difficulty'] - features['lower_region_difficulty']\n",
" else:\n",
" features['difficulty_progression'] = np.nan\n",
" else:\n",
" features['lower_region_difficulty'] = features['mean_hold_difficulty']\n",
" features['middle_region_difficulty'] = features['mean_hold_difficulty']\n",
" features['upper_region_difficulty'] = features['mean_hold_difficulty']\n",
" features['difficulty_progression'] = 0\n",
" else:\n",
" features['lower_region_difficulty'] = np.nan\n",
" features['middle_region_difficulty'] = np.nan\n",
" features['upper_region_difficulty'] = np.nan\n",
" features['difficulty_progression'] = np.nan\n",
" \n",
" # Hardest move estimate (max difficulty jump between consecutive holds)\n",
" if len(hand_holds) >= 2 and len(hand_diffs) >= 2:\n",
" # Sort hand holds by y position\n",
" hand_sorted = hand_holds.sort_values('y')\n",
" hand_diff_sorted = hand_sorted['difficulty'].dropna().values\n",
" \n",
" if len(hand_diff_sorted) >= 2:\n",
" difficulty_jumps = np.abs(np.diff(hand_diff_sorted))\n",
" features['max_difficulty_jump'] = np.max(difficulty_jumps) if len(difficulty_jumps) > 0 else 0\n",
" features['mean_difficulty_jump'] = np.mean(difficulty_jumps) if len(difficulty_jumps) > 0 else 0\n",
" else:\n",
" features['max_difficulty_jump'] = 0\n",
" features['mean_difficulty_jump'] = 0\n",
" else:\n",
" features['max_difficulty_jump'] = 0\n",
" features['mean_difficulty_jump'] = 0\n",
" \n",
" # Difficulty-weighted reach (combine difficulty with distances)\n",
" if len(hand_holds) >= 2 and len(hand_diffs) >= 2:\n",
" hand_sorted = hand_holds.sort_values('y')\n",
" xs_sorted = hand_sorted['x'].values\n",
" ys_sorted = hand_sorted['y'].values\n",
" diffs_sorted = hand_sorted['difficulty'].fillna(hand_diffs.mean()).values\n",
" \n",
" weighted_reach = []\n",
" for i in range(len(hand_sorted) - 1):\n",
" dx = xs_sorted[i+1] - xs_sorted[i]\n",
" dy = ys_sorted[i+1] - ys_sorted[i]\n",
" dist = np.sqrt(dx**2 + dy**2)\n",
" avg_diff = (diffs_sorted[i] + diffs_sorted[i+1]) / 2\n",
" weighted_reach.append(dist * avg_diff)\n",
" \n",
" features['difficulty_weighted_reach'] = np.mean(weighted_reach) if weighted_reach else 0\n",
" features['max_weighted_reach'] = np.max(weighted_reach) if weighted_reach else 0\n",
" else:\n",
" features['difficulty_weighted_reach'] = 0\n",
" features['max_weighted_reach'] = 0\n",
" \n",
" # =====================================================\n",
" # 17. POSITION-NORMALIZED FEATURES\n",
" # =====================================================\n",
" \n",
" # Normalized positions (0-1 scale)\n",
" features['mean_x_normalized'] = (features['mean_x'] - x_min) / board_width\n",
" features['path_length_vertical'] = 0.0\n",
" features['path_efficiency'] = 0.0\n",
"\n",
" # Normalized / relative\n",
" features['mean_y_normalized'] = (features['mean_y'] - y_min) / board_height\n",
" features['std_x_normalized'] = features['std_x'] / board_width\n",
" features['std_y_normalized'] = features['std_y'] / board_height\n",
" \n",
" # Start/finish normalized\n",
" if pd.notna(features.get('start_height')):\n",
" features['start_height_normalized'] = (features['start_height'] - y_min) / board_height\n",
" else:\n",
" features['start_height_normalized'] = np.nan\n",
" \n",
" if pd.notna(features.get('finish_height')):\n",
" features['finish_height_normalized'] = (features['finish_height'] - y_min) / board_height\n",
" else:\n",
" features['finish_height_normalized'] = np.nan\n",
" \n",
" # Distance from typical positions (center bottom for start, center top for finish)\n",
" typical_start_y = y_min + board_height * 0.15\n",
" typical_finish_y = y_min + board_height * 0.85\n",
" \n",
" if pd.notna(features.get('start_height')):\n",
" features['start_offset_from_typical'] = abs(features['start_height'] - typical_start_y)\n",
" else:\n",
" features['start_offset_from_typical'] = np.nan\n",
" \n",
" if pd.notna(features.get('finish_height')):\n",
" features['finish_offset_from_typical'] = abs(features['finish_height'] - typical_finish_y)\n",
" else:\n",
" features['finish_offset_from_typical'] = np.nan\n",
" \n",
" # Hold positions relative to start\n",
" if len(start_holds) > 0:\n",
" start_y = start_holds['y'].mean()\n",
" features['mean_y_relative_to_start'] = features['mean_y'] - start_y\n",
" features['max_y_relative_to_start'] = features['max_y'] - start_y\n",
" else:\n",
" features['mean_y_relative_to_start'] = np.nan\n",
" features['max_y_relative_to_start'] = np.nan\n",
" \n",
" # Spread normalized by board\n",
" features['start_height_normalized'] = (\n",
" (start_height - y_min) / board_height if pd.notna(start_height) else np.nan\n",
" )\n",
" features['finish_height_normalized'] = (\n",
" (finish_height - y_min) / board_height if pd.notna(finish_height) else np.nan\n",
" )\n",
" features['mean_y_relative_to_start'] = (\n",
" features['mean_y'] - start_height if pd.notna(start_height) else np.nan\n",
" )\n",
" features['spread_x_normalized'] = features['range_x'] / board_width\n",
" features['spread_y_normalized'] = features['range_y'] / board_height\n",
" \n",
" # Bbox coverage (percentage of board covered)\n",
" features['bbox_coverage_x'] = features['range_x'] / board_width\n",
" features['bbox_coverage_y'] = features['range_y'] / board_height\n",
" \n",
" # Position quartile features\n",
" y_quartiles = np.percentile(ys, [25, 50, 75])\n",
" features['y_q25'] = y_quartiles[0]\n",
" features['y_q50'] = y_quartiles[1]\n",
" features['y_q75'] = y_quartiles[2]\n",
" features['y_iqr'] = y_quartiles[2] - y_quartiles[0]\n",
" \n",
" # Holds in each vertical quartile\n",
" features['holds_bottom_quartile'] = (ys < y_quartiles[0]).sum()\n",
" features['holds_top_quartile'] = (ys >= y_quartiles[2]).sum()\n",
" \n",
"\n",
" y_q75 = np.percentile(ys, 75)\n",
" y_q25 = np.percentile(ys, 25)\n",
" features['y_q75'] = y_q75\n",
" features['y_iqr'] = y_q75 - y_q25\n",
"\n",
" # Optional engineered clean feature\n",
" features['complexity_score'] = (\n",
" features['mean_hand_reach']\n",
" * np.log1p(features['total_holds'])\n",
" * (1 + features['hold_density'])\n",
" )\n",
"\n",
" return features"
]
},
@@ -851,7 +502,7 @@
"source": [
"## Sanity Check on One Example\n",
"\n",
"Before extracting features for the entire dataset, we inspect one representative climb to confirm that the parsing logic and the computed geometric summaries behave as expected. Let's do the climb \"Ooo La La\" from notebook two.\n",
    "Before extracting features for the entire dataset, we inspect one representative climb to confirm that the parsing logic and the computed geometric summaries behave as expected. We use the climb \"Anna Got Me Clickin'\" from notebook 02.\n",
"\n",
"![Anna Got Me Clickin](../images/02_hold_stats/Anna_Got_Me_Clickin.png)\n"
]
@@ -863,7 +514,7 @@
"metadata": {},
"outputs": [],
"source": [
"extract_features(df_climbs.iloc[10000], placement_coords, df_placements)"
"extract_features(df_climbs.iloc[10000], placement_coords)"
]
},
{
@@ -902,7 +553,7 @@
"feature_list = []\n",
"\n",
"for idx, row in tqdm(df_climbs.iterrows(), total=len(df_climbs)):\n",
" features = extract_features(row, placement_coords, df_hold_difficulty)\n",
" features = extract_features(row, placement_coords)\n",
" if features:\n",
" features['climb_uuid'] = row['uuid']\n",
" features['display_difficulty'] = row['display_difficulty']\n",
@@ -997,22 +648,37 @@
"fig, axes = plt.subplots(4, 4, figsize=(16, 16))\n",
"\n",
"key_features = [\n",
" # Core driver\n",
" 'angle',\n",
"\n",
" # Basic structure\n",
" 'total_holds',\n",
" 'height_gained',\n",
" 'mean_hold_difficulty',\n",
" 'max_hold_difficulty',\n",
" 'mean_hand_reach',\n",
"\n",
" # Density / compactness\n",
" 'hold_density',\n",
" 'holds_per_vertical_foot',\n",
"\n",
" # Hand geometry (very important)\n",
" 'mean_hand_reach',\n",
" 'max_hand_reach',\n",
" 'std_hand_reach',\n",
"\n",
" # Hand-foot interaction\n",
" 'mean_hand_to_foot',\n",
" 'std_hand_to_foot',\n",
"\n",
" # Spatial layout\n",
" 'symmetry_score',\n",
" 'is_nomatch',\n",
" 'upper_ratio',\n",
"\n",
" # Global geometry\n",
" 'convex_hull_area',\n",
" 'difficulty_progression',\n",
" 'mean_y_normalized',\n",
" 'clustering_ratio',\n",
" 'path_efficiency',\n",
" 'max_difficulty_jump',\n",
" 'difficulty_weighted_reach'\n",
" 'hull_area_to_bbox_ratio',\n",
"\n",
" # Path / flow\n",
" 'path_length_vertical',\n",
" 'path_efficiency'\n",
"]\n",
"\n",
"for ax, feature in zip(axes.flat, key_features):\n",
@@ -1042,12 +708,8 @@
"\n",
"# Angle interactions\n",
"df_features['angle_x_holds'] = df_features['angle'] * df_features['total_holds']\n",
"df_features['angle_x_difficulty'] = df_features['angle'] * df_features['mean_hold_difficulty'].fillna(0)\n",
"df_features['angle_squared'] = df_features['angle'] ** 2\n",
"\n",
"# Difficulty interactions\n",
"df_features['difficulty_x_height'] = df_features['mean_hold_difficulty'].fillna(0) * df_features['height_gained']\n",
"df_features['difficulty_x_density'] = df_features['mean_hold_difficulty'].fillna(0) * df_features['hold_density']\n",
"\n",
"# Complexity features\n",
"df_features['complexity_score'] = (\n",
@@ -1056,9 +718,6 @@
" df_features['hold_density']\n",
")\n",
"\n",
"# Geometric × difficulty\n",
"df_features['hull_area_x_difficulty'] = df_features['convex_hull_area'].fillna(0) * df_features['mean_hold_difficulty'].fillna(0)\n",
"\n",
"print(f\"Added interaction features. Total columns: {len(df_features.columns)}\")"
]
},
@@ -1081,23 +740,6 @@
"print(\"### Columns with Missing Values\\n\")\n",
"display(missing_cols.to_frame('missing'))\n",
"\n",
"# Fill difficulty NaNs with column mean\n",
"difficulty_cols = [c for c in df_features.columns if 'difficulty' in c.lower()]\n",
"for col in difficulty_cols:\n",
" if df_features[col].isna().any():\n",
" df_features[col] = df_features[col].fillna(df_features[col].mean())\n",
"\n",
"# Fill start/finish height with min_y/max_y if missing\n",
"df_features['start_height'] = df_features['start_height'].fillna(df_features['min_y'])\n",
"df_features['finish_height'] = df_features['finish_height'].fillna(df_features['max_y'])\n",
"\n",
"# Fill normalized features\n",
"df_features['start_height_normalized'] = df_features['start_height_normalized'].fillna(\n",
" (df_features['start_height'] - y_min) / board_height\n",
")\n",
"df_features['finish_height_normalized'] = df_features['finish_height_normalized'].fillna(\n",
" (df_features['finish_height'] - y_min) / board_height\n",
")\n",
"\n",
"# Fill other NaNs with column means\n",
"for col in df_features.columns:\n",
@@ -1206,8 +848,7 @@
"print(\"\"\"\\nInterpretation:\n",
"- Each row is a climb-angle observation.\n",
"- The target is `display_difficulty`.\n",
"- The predictors combine geometry, hold statistics, and aggregate difficulty information.\n",
"- Hold-difficulty-based features use Bayesian-smoothed hold scores from Notebook 03.\n",
"- The predictors combine geometry and structure\n",
"- The next notebook tests how much predictive signal these engineered features actually contain.\n",
"\"\"\")\n"
]