diff --git a/data/04_climb_features/feature_list.txt b/data/04_climb_features/feature_list.txt new file mode 100644 index 0000000..3568f63 --- /dev/null +++ b/data/04_climb_features/feature_list.txt @@ -0,0 +1,120 @@ +angle +total_holds +hand_holds +foot_holds +start_holds +finish_holds +middle_holds +is_nomatch +mean_x +mean_y +std_x +std_y +range_x +range_y +min_y +max_y +start_height +start_height_min +start_height_max +finish_height +finish_height_min +finish_height_max +height_gained +height_gained_start_finish +bbox_area +bbox_aspect_ratio +bbox_normalized_area +hold_density +holds_per_vertical_foot +left_holds +right_holds +left_ratio +symmetry_score +hand_left_ratio +hand_symmetry +upper_holds +lower_holds +upper_ratio +max_hand_reach +min_hand_reach +mean_hand_reach +std_hand_reach +hand_spread_x +hand_spread_y +max_foot_spread +mean_foot_spread +foot_spread_x +foot_spread_y +max_hand_to_foot +min_hand_to_foot +mean_hand_to_foot +std_hand_to_foot +mean_hold_difficulty +max_hold_difficulty +min_hold_difficulty +std_hold_difficulty +median_hold_difficulty +difficulty_range +mean_hand_difficulty +max_hand_difficulty +std_hand_difficulty +mean_foot_difficulty +max_foot_difficulty +std_foot_difficulty +start_difficulty +finish_difficulty +hand_foot_ratio +movement_density +hold_com_x +hold_com_y +weighted_difficulty +convex_hull_area +convex_hull_perimeter +hull_area_to_bbox_ratio +min_nn_distance +mean_nn_distance +max_nn_distance +std_nn_distance +mean_neighbors_12in +max_neighbors_12in +clustering_ratio +path_length_vertical +path_efficiency +difficulty_gradient +lower_region_difficulty +middle_region_difficulty +upper_region_difficulty +difficulty_progression +max_difficulty_jump +mean_difficulty_jump +difficulty_weighted_reach +max_weighted_reach +mean_x_normalized +mean_y_normalized +std_x_normalized +std_y_normalized +start_height_normalized +finish_height_normalized +start_offset_from_typical +finish_offset_from_typical +mean_y_relative_to_start 
+max_y_relative_to_start +spread_x_normalized +spread_y_normalized +bbox_coverage_x +bbox_coverage_y +y_q25 +y_q50 +y_q75 +y_iqr +holds_bottom_quartile +holds_top_quartile +display_difficulty +angle_x_holds +angle_x_difficulty +angle_squared +difficulty_x_height +difficulty_x_density +complexity_score +hull_area_x_difficulty diff --git a/images/03_hold_difficulty/difficulty_hand_40deg.png b/images/03_hold_difficulty/difficulty_hand_40deg.png index eff191c..83b12b9 100644 Binary files a/images/03_hold_difficulty/difficulty_hand_40deg.png and b/images/03_hold_difficulty/difficulty_hand_40deg.png differ diff --git a/images/03_hold_difficulty/difficulty_heatmap_angle_weighted_difficulty.png b/images/03_hold_difficulty/difficulty_heatmap_angle_weighted_difficulty.png index ead459c..41630ce 100644 Binary files a/images/03_hold_difficulty/difficulty_heatmap_angle_weighted_difficulty.png and b/images/03_hold_difficulty/difficulty_heatmap_angle_weighted_difficulty.png differ diff --git a/images/03_hold_difficulty/difficulty_heatmap_foot_overall_avg.png b/images/03_hold_difficulty/difficulty_heatmap_foot_overall_avg.png index 4d10989..4400f80 100644 Binary files a/images/03_hold_difficulty/difficulty_heatmap_foot_overall_avg.png and b/images/03_hold_difficulty/difficulty_heatmap_foot_overall_avg.png differ diff --git a/images/03_hold_difficulty/difficulty_heatmap_hand_overall_avg.png b/images/03_hold_difficulty/difficulty_heatmap_hand_overall_avg.png index 33f042a..252f165 100644 Binary files a/images/03_hold_difficulty/difficulty_heatmap_hand_overall_avg.png and b/images/03_hold_difficulty/difficulty_heatmap_hand_overall_avg.png differ diff --git a/images/03_hold_difficulty/difficulty_heatmap_overall_difficulty.png b/images/03_hold_difficulty/difficulty_heatmap_overall_difficulty.png index d370198..4ea37c3 100644 Binary files a/images/03_hold_difficulty/difficulty_heatmap_overall_difficulty.png and b/images/03_hold_difficulty/difficulty_heatmap_overall_difficulty.png differ 
diff --git a/images/04_climb_features/feature_correlations.png b/images/04_climb_features/feature_correlations.png new file mode 100644 index 0000000..c8e9133 Binary files /dev/null and b/images/04_climb_features/feature_correlations.png differ diff --git a/images/Kilter-original-16x12.png b/images/Kilter-original-16x12.png deleted file mode 100644 index d90af2c..0000000 Binary files a/images/Kilter-original-16x12.png and /dev/null differ diff --git a/notebooks/01_data_overview_and_climbing_statistics.ipynb b/notebooks/01_data_overview_and_climbing_statistics.ipynb index 5346b52..4e9ac39 100644 --- a/notebooks/01_data_overview_and_climbing_statistics.ipynb +++ b/notebooks/01_data_overview_and_climbing_statistics.ipynb @@ -5,7 +5,7 @@ "id": "37e8cfe9", "metadata": {}, "source": [ - "# Tension Board 2 / Tension Board 1: Data Overview and Climbing Statistics\n", + "Kilter Board: Data Overview and Climbing Statistics\n", "\n", "## Purpose\n", "\n", diff --git a/notebooks/04_feature_engineering.ipynb b/notebooks/04_feature_engineering.ipynb new file mode 100644 index 0000000..8ec02d4 --- /dev/null +++ b/notebooks/04_feature_engineering.ipynb @@ -0,0 +1,1237 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "ed7f1a86", + "metadata": {}, + "source": [ + "# Tension Board 2 Mirror: Feature Engineering\n", + "\n", + "The goal of this notebook is to convert raw climb descriptions into a clean modelling table. Each row of the final table corresponds to a single climb-angle observation, and each column is a numeric feature that may help predict grade.\n", + "\n", + "## Modelling idea\n", + "\n", + "A climb's grade should depend on more than just angle. It should also depend on the geometry and sequencing of the holds used. To capture that, this notebook builds features from three sources:\n", + "\n", + "1. **Wall configuration** \n", + " Examples: angle, board geometry, mirrored placements.\n", + "\n", + "2. 
**Route structure** \n", + " Examples: number of holds, spatial spread, height gained, move lengths, left/right balance, and other frame-derived quantities.\n", + "\n", + "3. **Hold difficulty priors** \n", + " Examples: average, maximum, and distributional summaries of the empirical hold scores built in notebook 03.\n", + "\n", + "## Output\n", + "\n", + "The final product is a saved feature matrix that is reused in the predictive modelling and deep learning notebooks.\n", + "\n", + "## Notebook Structure\n", + "\n", + "1. [Setup and Imports](#setup-and-imports)\n", + "2. [Feature Extraction](#feature-extraction)\n", + "3. [Visualizing Key Features](#visualizing-key-features)\n", + "4. [Conclusion](#conclusion)" + ] + }, + { + "cell_type": "markdown", + "id": "ef5d85ef", + "metadata": {}, + "source": [ + "# Setup and Imports\n", + "\n", + "This section loads the database, auxiliary tables, and the hold-difficulty table produced in notebook 03.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "513d5c42", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"\n", + "==================================\n", + "Setup and Imports\n", + "==================================\n", + "\"\"\"\n", + "\n", + "# Imports\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "import numpy as np\n", + "import matplotlib.patches as mpatches\n", + "\n", + "from sklearn.ensemble import RandomForestRegressor\n", + "from sklearn.model_selection import cross_val_score\n", + "\n", + "from scipy.spatial import ConvexHull\n", + "from scipy.spatial.distance import pdist, squareform\n", + "\n", + "import sqlite3\n", + "\n", + "import re\n", + "import os\n", + "from collections import defaultdict\n", + "\n", + "import ast\n", + "\n", + "from PIL import Image\n", + "\n", + "# Set some display options\n", + "pd.set_option('display.max_columns', None)\n", + "pd.set_option('display.max_rows', 100)\n", + "\n", + "# Set 
style\n", + "palette=['steelblue', 'coral', 'seagreen'] #(for multi-bar graphs)\n", + "\n", + "# Set board image for some visual analysis\n", + "board_img = Image.open('../images/kilter-original-16x12_compose.png')\n", + "\n", + "# Connect to the database\n", + "DB_PATH=\"../data/kilter.db\"\n", + "conn = sqlite3.connect(DB_PATH)\n", + "\n", + "# Create output directories\n", + "os.makedirs('../data/04_climb_features', exist_ok=True)\n", + "os.makedirs('../images/04_climb_features', exist_ok=True)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "04f9ccb8", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"\n", + "==================================\n", + "Query our data from the DB\n", + "==================================\n", + "\n", + "We restrict to `layout_id=1` for the Kilter Board Original\n", + "\n", + "\"\"\"\n", + "\n", + "# Query climbs data\n", + "climbs_query = \"\"\"\n", + "SELECT\n", + " c.uuid,\n", + " c.name AS climb_name,\n", + " c.setter_username,\n", + " c.layout_id AS layout_id,\n", + " c.description,\n", + " c.is_nomatch,\n", + " c.is_listed,\n", + " l.name AS layout_name,\n", + " p.name AS board_name,\n", + " c.frames,\n", + " cs.angle,\n", + " cs.display_difficulty,\n", + " dg.boulder_name AS boulder_grade,\n", + " cs.ascensionist_count,\n", + " cs.quality_average,\n", + " cs.fa_at\n", + "FROM climbs c\n", + "JOIN layouts l ON c.layout_id = l.id\n", + "JOIN products p ON l.product_id = p.id\n", + "JOIN climb_stats cs ON c.uuid = cs.climb_uuid\n", + "JOIN difficulty_grades dg ON ROUND(cs.display_difficulty) = dg.difficulty\n", + "WHERE cs.display_difficulty IS NOT NULL AND c.is_listed=1 AND c.layout_id=1 AND cs.fa_at > '2016-01-01'\n", + "\"\"\"\n", + "\n", + "# Query information about placements (and their mirrors)\n", + "placements_query = \"\"\"\n", + "SELECT\n", + " p.id AS placement_id,\n", + " h.x,\n", + " h.y,\n", + " p.default_placement_role_id AS default_role_id,\n", + " p.set_id AS set_id,\n", + " 
s.name AS set_name\n", + "FROM placements p\n", + "JOIN holes h ON p.hole_id = h.id\n", + "JOIN sets s ON p.set_id = s.id\n", + "WHERE p.layout_id = 1 AND y <=156\n", + "\"\"\"\n", + "\n", + "# Load it into a DataFrame\n", + "df_climbs = pd.read_sql_query(climbs_query, conn)\n", + "df_placements = pd.read_sql_query(placements_query, conn)\n", + "\n", + "# Load the hold-level difficulty table created in notebook 03\n", + "df_hold_difficulty = pd.read_csv('../data/03_hold_difficulty/hold_difficulty_scores.csv', index_col='placement_id')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8e5d93f9", + "metadata": {}, + "outputs": [], + "source": [ + "print(\"Difficulty-related columns loaded from Notebook 03:\")\n", + "print([c for c in df_hold_difficulty.columns if 'difficulty' in c.lower()])\n", + "\n", + "assert 'overall_difficulty' in df_hold_difficulty.columns, \"Missing overall_difficulty\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d0c28b51", + "metadata": {}, + "outputs": [], + "source": [ + "df_hold_difficulty" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f54f7f6c", + "metadata": {}, + "outputs": [], + "source": [ + "placement_coords = {\n", + " row['placement_id']: (row['x'], row['y'])\n", + " for _, row in df_placements.iterrows()\n", + "}\n", + "\n", + "board_width = 192\n", + "board_height = 144\n", + "\n", + "x_min, x_max = -24, 168\n", + "y_min, y_max = 0, 156\n", + "\n", + "# Role definitions (TB2)\n", + "ROLE_DEFINITIONS = {\n", + " 'start': 12,\n", + " 'middle': 13,\n", + " 'finish': 14,\n", + " 'foot': 15\n", + "}\n", + "\n", + "HAND_ROLE_IDS = [12, 13, 14]\n", + "FOOT_ROLE_IDS = [15]\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "38e865a4", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"\n", + "==================================\n", + "Parse Frame function\n", + "==================================\n", + "\"\"\"\n", + "\n", + 
"def parse_frames(frames_str):\n", + " \"\"\"\n", + " Parse frames string into list of (placement_id, role_id) tuples.\n", + " \n", + " Parameters:\n", + " -----------\n", + " frames_str : str\n", + " Frame string like \"p1r5p2r6p3r8\"\n", + " \n", + " Returns:\n", + " --------\n", + " list of tuples: [(placement_id, role_id), ...]\n", + " \"\"\"\n", + " if not isinstance(frames_str, str):\n", + " return []\n", + " \n", + " matches = re.findall(r'p(\\d+)r(\\d+)', frames_str)\n", + " return [(int(p), int(r)) for p, r in matches]\n", + "\n", + "\n", + "def get_role_type(role_id):\n", + " \"\"\"Map role_id to role type string.\"\"\"\n", + " for role_type, rid in ROLE_DEFINITIONS.items():\n", + " if role_id == rid:\n", + " return role_type\n", + " return 'unknown'\n", + "\n", + "\n", + "# Test\n", + "test_frames = \"p1r5p2r6p3r8p4r5\"\n", + "parsed = parse_frames(test_frames)\n", + "print(f\"Test parse: {parsed}\")" + ] + }, + { + "cell_type": "markdown", + "id": "b2f2138a", + "metadata": {}, + "source": [ + "# Feature Extraction\n", + "\n", + "This is the core notebook section. 
The aim is to translate the raw `frames` string into a route-level numerical representation suitable for regression or classification models.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eeba545e", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"\n", + "==================================\n", + "Feature Exraction Function\n", + "==================================\n", + "\"\"\"\n", + "\n", + "def extract_features(row, placement_coords, df_hold_difficulty):\n", + " \"\"\"\n", + " Extract all features from a single climb row.\n", + " \"\"\"\n", + " features = {}\n", + " \n", + " # Parse frames\n", + " holds = parse_frames(row['frames'])\n", + " angle = row['angle']\n", + " \n", + " if not holds:\n", + " return None\n", + " \n", + " # =====================\n", + " # BASIC HOLD EXTRACTION\n", + " # =====================\n", + " \n", + " hold_data = []\n", + " for placement_id, role_id in holds:\n", + " coords = placement_coords.get(placement_id, (None, None))\n", + " if coords[0] is None:\n", + " continue\n", + " \n", + " role_type = get_role_type(role_id)\n", + " is_hand = role_id in HAND_ROLE_IDS\n", + " is_foot = role_id in FOOT_ROLE_IDS\n", + " \n", + " # Get difficulty scores for this hold at this angle\n", + " diff_key = f\"{role_type}_diff_{int(angle)}deg\"\n", + " hand_diff_key = f\"hand_diff_{int(angle)}deg\"\n", + " foot_diff_key = f\"foot_diff_{int(angle)}deg\"\n", + " \n", + " difficulty = None\n", + " if placement_id in df_hold_difficulty.index:\n", + " # Try role-specific first, then aggregate\n", + " if diff_key in df_hold_difficulty.columns:\n", + " difficulty = df_hold_difficulty.loc[placement_id, diff_key]\n", + " if pd.isna(difficulty):\n", + " if is_hand and hand_diff_key in df_hold_difficulty.columns:\n", + " difficulty = df_hold_difficulty.loc[placement_id, hand_diff_key]\n", + " elif is_foot and foot_diff_key in df_hold_difficulty.columns:\n", + " difficulty = df_hold_difficulty.loc[placement_id, 
foot_diff_key]\n", + " \n", + " # Fallback to overall\n", + " if pd.isna(difficulty) and 'overall_difficulty' in df_hold_difficulty.columns:\n", + " difficulty = df_hold_difficulty.loc[placement_id, 'overall_difficulty']\n", + " \n", + " hold_data.append({\n", + " 'placement_id': placement_id,\n", + " 'x': coords[0],\n", + " 'y': coords[1],\n", + " 'role_id': role_id,\n", + " 'role_type': role_type,\n", + " 'is_hand': is_hand,\n", + " 'is_foot': is_foot,\n", + " 'difficulty': difficulty\n", + " })\n", + " \n", + " if not hold_data:\n", + " return None\n", + " \n", + " df_holds = pd.DataFrame(hold_data)\n", + " \n", + " # Separate by role\n", + " hand_holds = df_holds[df_holds['is_hand']]\n", + " foot_holds = df_holds[df_holds['is_foot']]\n", + " start_holds = df_holds[df_holds['role_type'] == 'start']\n", + " finish_holds = df_holds[df_holds['role_type'] == 'finish']\n", + " middle_holds = df_holds[df_holds['role_type'] == 'middle']\n", + " \n", + " # =====================\n", + " # 1. ANGLE\n", + " # =====================\n", + " features['angle'] = angle\n", + " \n", + " # =====================\n", + " # 2. BASIC COUNTS\n", + " # =====================\n", + " features['total_holds'] = len(df_holds)\n", + " features['hand_holds'] = len(hand_holds)\n", + " features['foot_holds'] = len(foot_holds)\n", + " features['start_holds'] = len(start_holds)\n", + " features['finish_holds'] = len(finish_holds)\n", + " features['middle_holds'] = len(middle_holds)\n", + " \n", + " # =====================\n", + " # 3. MATCHING FEATURE\n", + " # =====================\n", + " # A climb is \"matching\" if you are allowed to match your hands at any hold.\n", + " # There are slight difference in difficulties of matchines vs no matching climbs as per our analysis in 01.\n", + " features['is_nomatch'] = int((row['is_nomatch'] == 1) or bool(re.search(r'\\bno\\s*match(ing)?\\b', row['description'], flags=re.IGNORECASE)))\n", + " \n", + " # =====================\n", + " # 4. 
SPATIAL/POSITION\n", + " # =====================\n", + " xs = df_holds['x'].values\n", + " ys = df_holds['y'].values\n", + " \n", + " features['mean_x'] = np.mean(xs)\n", + " features['mean_y'] = np.mean(ys)\n", + " features['std_x'] = np.std(xs) if len(xs) > 1 else 0\n", + " features['std_y'] = np.std(ys) if len(ys) > 1 else 0\n", + " features['range_x'] = np.max(xs) - np.min(xs)\n", + " features['range_y'] = np.max(ys) - np.min(ys)\n", + " features['min_y'] = np.min(ys)\n", + " features['max_y'] = np.max(ys)\n", + " \n", + " # =====================\n", + " # 5. HEIGHT FEATURES\n", + " # =====================\n", + " if len(start_holds) > 0:\n", + " features['start_height'] = start_holds['y'].mean()\n", + " features['start_height_min'] = start_holds['y'].min()\n", + " features['start_height_max'] = start_holds['y'].max()\n", + " else:\n", + " features['start_height'] = np.nan\n", + " features['start_height_min'] = np.nan\n", + " features['start_height_max'] = np.nan\n", + " \n", + " if len(finish_holds) > 0:\n", + " features['finish_height'] = finish_holds['y'].mean()\n", + " features['finish_height_min'] = finish_holds['y'].min()\n", + " features['finish_height_max'] = finish_holds['y'].max()\n", + " else:\n", + " features['finish_height'] = np.nan\n", + " features['finish_height_min'] = np.nan\n", + " features['finish_height_max'] = np.nan\n", + " \n", + " features['height_gained'] = features['max_y'] - features['min_y']\n", + " \n", + " if pd.notna(features.get('finish_height')) and pd.notna(features.get('start_height')):\n", + " features['height_gained_start_finish'] = features['finish_height'] - features['start_height']\n", + " else:\n", + " features['height_gained_start_finish'] = np.nan\n", + " \n", + " # =====================\n", + " # 6. 
BBOX FEATURES\n", + " # =====================\n", + " bbox_width = features['range_x']\n", + " bbox_height = features['range_y']\n", + " features['bbox_area'] = bbox_width * bbox_height\n", + " features['bbox_aspect_ratio'] = bbox_width / bbox_height if bbox_height > 0 else 0\n", + " features['bbox_normalized_area'] = features['bbox_area'] / (board_width * board_height)\n", + " \n", + " # =====================\n", + " # 7. HOLD DENSITY\n", + " # =====================\n", + " if features['bbox_area'] > 0:\n", + " features['hold_density'] = features['total_holds'] / features['bbox_area']\n", + " else:\n", + " features['hold_density'] = 0\n", + " \n", + " features['holds_per_vertical_foot'] = features['total_holds'] / max(features['range_y'], 1)\n", + " \n", + " # =====================\n", + " # 8. SYMMETRY/BALANCE\n", + " # =====================\n", + " center_x = (x_min + x_max) / 2\n", + " features['left_holds'] = (df_holds['x'] < center_x).sum()\n", + " features['right_holds'] = (df_holds['x'] >= center_x).sum()\n", + " features['left_ratio'] = features['left_holds'] / features['total_holds'] if features['total_holds'] > 0 else 0.5\n", + " \n", + " # Symmetry score (how balanced left/right)\n", + " features['symmetry_score'] = 1 - abs(features['left_ratio'] - 0.5) * 2\n", + " \n", + " # Hand symmetry\n", + " if len(hand_holds) > 0:\n", + " hand_left = (hand_holds['x'] < center_x).sum()\n", + " hand_right = (hand_holds['x'] >= center_x).sum()\n", + " features['hand_left_ratio'] = hand_left / len(hand_holds)\n", + " features['hand_symmetry'] = 1 - abs(features['hand_left_ratio'] - 0.5) * 2\n", + " else:\n", + " features['hand_left_ratio'] = np.nan\n", + " features['hand_symmetry'] = np.nan\n", + " \n", + " # =====================\n", + " # 9. 
VERTICAL DISTRIBUTION\n", + " # =====================\n", + " y_median = np.median(ys)\n", + " features['upper_holds'] = (df_holds['y'] > y_median).sum()\n", + " features['lower_holds'] = (df_holds['y'] <= y_median).sum()\n", + " features['upper_ratio'] = features['upper_holds'] / features['total_holds']\n", + " \n", + " # =====================\n", + " # 10. HAND REACH / SPREAD\n", + " # =====================\n", + " if len(hand_holds) >= 2:\n", + " hand_xs = hand_holds['x'].values\n", + " hand_ys = hand_holds['y'].values\n", + " \n", + " hand_distances = []\n", + " for i in range(len(hand_holds)):\n", + " for j in range(i + 1, len(hand_holds)):\n", + " dx = hand_xs[i] - hand_xs[j]\n", + " dy = hand_ys[i] - hand_ys[j]\n", + " hand_distances.append(np.sqrt(dx**2 + dy**2))\n", + " \n", + " features['max_hand_reach'] = max(hand_distances)\n", + " features['min_hand_reach'] = min(hand_distances)\n", + " features['mean_hand_reach'] = np.mean(hand_distances)\n", + " features['std_hand_reach'] = np.std(hand_distances)\n", + " features['hand_spread_x'] = hand_xs.max() - hand_xs.min()\n", + " features['hand_spread_y'] = hand_ys.max() - hand_ys.min()\n", + " else:\n", + " features['max_hand_reach'] = 0\n", + " features['min_hand_reach'] = 0\n", + " features['mean_hand_reach'] = 0\n", + " features['std_hand_reach'] = 0\n", + " features['hand_spread_x'] = 0\n", + " features['hand_spread_y'] = 0\n", + " \n", + " # =====================\n", + " # 11. 
FOOT SPREAD\n", + " # =====================\n", + " if len(foot_holds) >= 2:\n", + " foot_xs = foot_holds['x'].values\n", + " foot_ys = foot_holds['y'].values\n", + " \n", + " foot_distances = []\n", + " for i in range(len(foot_holds)):\n", + " for j in range(i + 1, len(foot_holds)):\n", + " dx = foot_xs[i] - foot_xs[j]\n", + " dy = foot_ys[i] - foot_ys[j]\n", + " foot_distances.append(np.sqrt(dx**2 + dy**2))\n", + " \n", + " features['max_foot_spread'] = max(foot_distances)\n", + " features['mean_foot_spread'] = np.mean(foot_distances)\n", + " features['foot_spread_x'] = foot_xs.max() - foot_xs.min()\n", + " features['foot_spread_y'] = foot_ys.max() - foot_ys.min()\n", + " else:\n", + " features['max_foot_spread'] = 0\n", + " features['mean_foot_spread'] = 0\n", + " features['foot_spread_x'] = 0\n", + " features['foot_spread_y'] = 0\n", + " \n", + " # =====================\n", + " # 12. HAND-TO-FOOT DISTANCES\n", + " # =====================\n", + " if len(hand_holds) > 0 and len(foot_holds) > 0:\n", + " h2f_distances = []\n", + " for _, h in hand_holds.iterrows():\n", + " for _, f in foot_holds.iterrows():\n", + " dx = h['x'] - f['x']\n", + " dy = h['y'] - f['y']\n", + " h2f_distances.append(np.sqrt(dx**2 + dy**2))\n", + " \n", + " features['max_hand_to_foot'] = max(h2f_distances)\n", + " features['min_hand_to_foot'] = min(h2f_distances)\n", + " features['mean_hand_to_foot'] = np.mean(h2f_distances)\n", + " features['std_hand_to_foot'] = np.std(h2f_distances)\n", + " else:\n", + " features['max_hand_to_foot'] = 0\n", + " features['min_hand_to_foot'] = 0\n", + " features['mean_hand_to_foot'] = 0\n", + " features['std_hand_to_foot'] = 0\n", + " \n", + " # =====================\n", + " # 13. 
HOLD DIFFICULTY FEATURES\n", + " # =====================\n", + " difficulties = df_holds['difficulty'].dropna().values\n", + " \n", + " if len(difficulties) > 0:\n", + " features['mean_hold_difficulty'] = np.mean(difficulties)\n", + " features['max_hold_difficulty'] = np.max(difficulties)\n", + " features['min_hold_difficulty'] = np.min(difficulties)\n", + " features['std_hold_difficulty'] = np.std(difficulties)\n", + " features['median_hold_difficulty'] = np.median(difficulties)\n", + " features['difficulty_range'] = features['max_hold_difficulty'] - features['min_hold_difficulty']\n", + " else:\n", + " features['mean_hold_difficulty'] = np.nan\n", + " features['max_hold_difficulty'] = np.nan\n", + " features['min_hold_difficulty'] = np.nan\n", + " features['std_hold_difficulty'] = np.nan\n", + " features['median_hold_difficulty'] = np.nan\n", + " features['difficulty_range'] = np.nan\n", + " \n", + " # Hand difficulty\n", + " hand_diffs = hand_holds['difficulty'].dropna().values if len(hand_holds) > 0 else np.array([])\n", + " if len(hand_diffs) > 0:\n", + " features['mean_hand_difficulty'] = np.mean(hand_diffs)\n", + " features['max_hand_difficulty'] = np.max(hand_diffs)\n", + " features['std_hand_difficulty'] = np.std(hand_diffs)\n", + " else:\n", + " features['mean_hand_difficulty'] = np.nan\n", + " features['max_hand_difficulty'] = np.nan\n", + " features['std_hand_difficulty'] = np.nan\n", + " \n", + " # Foot difficulty\n", + " foot_diffs = foot_holds['difficulty'].dropna().values if len(foot_holds) > 0 else np.array([])\n", + " if len(foot_diffs) > 0:\n", + " features['mean_foot_difficulty'] = np.mean(foot_diffs)\n", + " features['max_foot_difficulty'] = np.max(foot_diffs)\n", + " features['std_foot_difficulty'] = np.std(foot_diffs)\n", + " else:\n", + " features['mean_foot_difficulty'] = np.nan\n", + " features['max_foot_difficulty'] = np.nan\n", + " features['std_foot_difficulty'] = np.nan\n", + " \n", + " # Start/Finish difficulty\n", + " start_diffs = 
start_holds['difficulty'].dropna().values if len(start_holds) > 0 else np.array([])\n", + " finish_diffs = finish_holds['difficulty'].dropna().values if len(finish_holds) > 0 else np.array([])\n", + " \n", + " features['start_difficulty'] = np.mean(start_diffs) if len(start_diffs) > 0 else np.nan\n", + " features['finish_difficulty'] = np.mean(finish_diffs) if len(finish_diffs) > 0 else np.nan\n", + " \n", + " # =====================\n", + " # 14. ADDITIONAL BASIC FEATURES\n", + " # =====================\n", + " \n", + " # Hand to foot ratio\n", + " features['hand_foot_ratio'] = features['hand_holds'] / max(features['foot_holds'], 1)\n", + " \n", + " # Movement complexity\n", + " features['movement_density'] = features['total_holds'] / max(features['height_gained'], 1)\n", + " \n", + " # Center of mass of holds\n", + " features['hold_com_x'] = np.average(xs, weights=None)\n", + " features['hold_com_y'] = np.average(ys, weights=None)\n", + " \n", + " # Weighted difficulty (by y position)\n", + " if len(difficulties) > 0 and len(ys) >= len(difficulties):\n", + " weights = (ys[:len(difficulties)] - ys.min()) / max(ys.max() - ys.min(), 1) + 0.5\n", + " features['weighted_difficulty'] = np.average(difficulties, weights=weights)\n", + " else:\n", + " features['weighted_difficulty'] = features['mean_hold_difficulty']\n", + " \n", + " # =====================================================\n", + " # 15. 
GEOMETRIC FEATURES\n", + " # =====================================================\n", + " \n", + " # Convex hull area (2D polygon enclosing all holds)\n", + " if len(df_holds) >= 3:\n", + " try:\n", + " points = np.column_stack([xs, ys])\n", + " hull = ConvexHull(points)\n", + " features['convex_hull_area'] = hull.volume # In 2D, volume = area\n", + " features['convex_hull_perimeter'] = hull.area # In 2D, area = perimeter\n", + " features['hull_area_to_bbox_ratio'] = features['convex_hull_area'] / max(features['bbox_area'], 1)\n", + " except:\n", + " features['convex_hull_area'] = np.nan\n", + " features['convex_hull_perimeter'] = np.nan\n", + " features['hull_area_to_bbox_ratio'] = np.nan\n", + " else:\n", + " features['convex_hull_area'] = 0\n", + " features['convex_hull_perimeter'] = 0\n", + " features['hull_area_to_bbox_ratio'] = 0\n", + " \n", + " # Nearest neighbor distances\n", + " if len(df_holds) >= 2:\n", + " points = np.column_stack([xs, ys])\n", + " distances = pdist(points)\n", + " \n", + " features['min_nn_distance'] = np.min(distances)\n", + " features['mean_nn_distance'] = np.mean(distances)\n", + " features['max_nn_distance'] = np.max(distances)\n", + " features['std_nn_distance'] = np.std(distances)\n", + " else:\n", + " features['min_nn_distance'] = 0\n", + " features['mean_nn_distance'] = 0\n", + " features['max_nn_distance'] = 0\n", + " features['std_nn_distance'] = 0\n", + " \n", + " # Clustering coefficient (holds grouped vs spread)\n", + " if len(df_holds) >= 3:\n", + " points = np.column_stack([xs, ys])\n", + " dist_matrix = squareform(pdist(points))\n", + " \n", + " # Count neighbors within threshold (e.g., 12 inches)\n", + " threshold = 12.0\n", + " neighbors_count = (dist_matrix < threshold).sum(axis=1) - 1 # Exclude self\n", + " features['mean_neighbors_12in'] = np.mean(neighbors_count)\n", + " features['max_neighbors_12in'] = np.max(neighbors_count)\n", + " \n", + " # Clustering: ratio of actual neighbors to max possible\n", + " 
avg_neighbors = np.mean(neighbors_count)\n", + " max_possible = len(df_holds) - 1\n", + " features['clustering_ratio'] = avg_neighbors / max_possible if max_possible > 0 else 0\n", + " else:\n", + " features['mean_neighbors_12in'] = 0\n", + " features['max_neighbors_12in'] = 0\n", + " features['clustering_ratio'] = 0\n", + " \n", + " # Path length (greedy nearest-neighbor tour)\n", + " if len(df_holds) >= 2:\n", + " # Sort by y (bottom to top) for approximate path\n", + " sorted_indices = np.argsort(ys)\n", + " sorted_points = np.column_stack([xs[sorted_indices], ys[sorted_indices]])\n", + " \n", + " path_length = 0\n", + " for i in range(len(sorted_points) - 1):\n", + " dx = sorted_points[i+1, 0] - sorted_points[i, 0]\n", + " dy = sorted_points[i+1, 1] - sorted_points[i, 1]\n", + " path_length += np.sqrt(dx**2 + dy**2)\n", + " \n", + " features['path_length_vertical'] = path_length\n", + " features['path_efficiency'] = features['height_gained'] / max(path_length, 1)\n", + " else:\n", + " features['path_length_vertical'] = 0\n", + " features['path_efficiency'] = 0\n", + " \n", + " # =====================================================\n", + " # 16. 
DIFFICULTY-WEIGHTED FEATURES\n", + " # =====================================================\n", + " \n", + " # Difficulty gradient (finish vs start)\n", + " if pd.notna(features.get('finish_difficulty')) and pd.notna(features.get('start_difficulty')):\n", + " features['difficulty_gradient'] = features['finish_difficulty'] - features['start_difficulty']\n", + " else:\n", + " features['difficulty_gradient'] = np.nan\n", + " \n", + " # Difficulty variance by vertical region (split into thirds)\n", + " if len(difficulties) > 0:\n", + " y_min_val, y_max_val = ys.min(), ys.max()\n", + " y_range = y_max_val - y_min_val\n", + " \n", + " if y_range > 0:\n", + " # Split into lower, middle, upper thirds\n", + " lower_mask = ys <= (y_min_val + y_range / 3)\n", + " middle_mask = (ys > y_min_val + y_range / 3) & (ys <= y_min_val + 2 * y_range / 3)\n", + " upper_mask = ys > (y_min_val + 2 * y_range / 3)\n", + " \n", + " # Get difficulties for each region\n", + " df_with_diff = df_holds.copy()\n", + " df_with_diff['lower'] = lower_mask\n", + " df_with_diff['middle'] = middle_mask\n", + " df_with_diff['upper'] = upper_mask\n", + " \n", + " lower_diffs = df_with_diff[df_with_diff['lower'] & df_with_diff['difficulty'].notna()]['difficulty']\n", + " middle_diffs = df_with_diff[df_with_diff['middle'] & df_with_diff['difficulty'].notna()]['difficulty']\n", + " upper_diffs = df_with_diff[df_with_diff['upper'] & df_with_diff['difficulty'].notna()]['difficulty']\n", + " \n", + " features['lower_region_difficulty'] = lower_diffs.mean() if len(lower_diffs) > 0 else np.nan\n", + " features['middle_region_difficulty'] = middle_diffs.mean() if len(middle_diffs) > 0 else np.nan\n", + " features['upper_region_difficulty'] = upper_diffs.mean() if len(upper_diffs) > 0 else np.nan\n", + " \n", + " # Difficulty progression (upper - lower)\n", + " if pd.notna(features['lower_region_difficulty']) and pd.notna(features['upper_region_difficulty']):\n", + " features['difficulty_progression'] = 
features['upper_region_difficulty'] - features['lower_region_difficulty']\n", + " else:\n", + " features['difficulty_progression'] = np.nan\n", + " else:\n", + " features['lower_region_difficulty'] = features['mean_hold_difficulty']\n", + " features['middle_region_difficulty'] = features['mean_hold_difficulty']\n", + " features['upper_region_difficulty'] = features['mean_hold_difficulty']\n", + " features['difficulty_progression'] = 0\n", + " else:\n", + " features['lower_region_difficulty'] = np.nan\n", + " features['middle_region_difficulty'] = np.nan\n", + " features['upper_region_difficulty'] = np.nan\n", + " features['difficulty_progression'] = np.nan\n", + " \n", + " # Hardest move estimate (max difficulty jump between consecutive holds)\n", + " if len(hand_holds) >= 2 and len(hand_diffs) >= 2:\n", + " # Sort hand holds by y position\n", + " hand_sorted = hand_holds.sort_values('y')\n", + " hand_diff_sorted = hand_sorted['difficulty'].dropna().values\n", + " \n", + " if len(hand_diff_sorted) >= 2:\n", + " difficulty_jumps = np.abs(np.diff(hand_diff_sorted))\n", + " features['max_difficulty_jump'] = np.max(difficulty_jumps) if len(difficulty_jumps) > 0 else 0\n", + " features['mean_difficulty_jump'] = np.mean(difficulty_jumps) if len(difficulty_jumps) > 0 else 0\n", + " else:\n", + " features['max_difficulty_jump'] = 0\n", + " features['mean_difficulty_jump'] = 0\n", + " else:\n", + " features['max_difficulty_jump'] = 0\n", + " features['mean_difficulty_jump'] = 0\n", + " \n", + " # Difficulty-weighted reach (combine difficulty with distances)\n", + " if len(hand_holds) >= 2 and len(hand_diffs) >= 2:\n", + " hand_sorted = hand_holds.sort_values('y')\n", + " xs_sorted = hand_sorted['x'].values\n", + " ys_sorted = hand_sorted['y'].values\n", + " diffs_sorted = hand_sorted['difficulty'].fillna(hand_diffs.mean()).values\n", + " \n", + " weighted_reach = []\n", + " for i in range(len(hand_sorted) - 1):\n", + " dx = xs_sorted[i+1] - xs_sorted[i]\n", + " dy = 
ys_sorted[i+1] - ys_sorted[i]\n", + " dist = np.sqrt(dx**2 + dy**2)\n", + " avg_diff = (diffs_sorted[i] + diffs_sorted[i+1]) / 2\n", + " weighted_reach.append(dist * avg_diff)\n", + " \n", + " features['difficulty_weighted_reach'] = np.mean(weighted_reach) if weighted_reach else 0\n", + " features['max_weighted_reach'] = np.max(weighted_reach) if weighted_reach else 0\n", + " else:\n", + " features['difficulty_weighted_reach'] = 0\n", + " features['max_weighted_reach'] = 0\n", + " \n", + " # =====================================================\n", + " # 17. POSITION-NORMALIZED FEATURES\n", + " # =====================================================\n", + " \n", + " # Normalized positions (0-1 scale)\n", + " features['mean_x_normalized'] = (features['mean_x'] - x_min) / board_width\n", + " features['mean_y_normalized'] = (features['mean_y'] - y_min) / board_height\n", + " features['std_x_normalized'] = features['std_x'] / board_width\n", + " features['std_y_normalized'] = features['std_y'] / board_height\n", + " \n", + " # Start/finish normalized\n", + " if pd.notna(features.get('start_height')):\n", + " features['start_height_normalized'] = (features['start_height'] - y_min) / board_height\n", + " else:\n", + " features['start_height_normalized'] = np.nan\n", + " \n", + " if pd.notna(features.get('finish_height')):\n", + " features['finish_height_normalized'] = (features['finish_height'] - y_min) / board_height\n", + " else:\n", + " features['finish_height_normalized'] = np.nan\n", + " \n", + " # Distance from typical positions (center bottom for start, center top for finish)\n", + " typical_start_y = y_min + board_height * 0.15\n", + " typical_finish_y = y_min + board_height * 0.85\n", + " \n", + " if pd.notna(features.get('start_height')):\n", + " features['start_offset_from_typical'] = abs(features['start_height'] - typical_start_y)\n", + " else:\n", + " features['start_offset_from_typical'] = np.nan\n", + " \n", + " if 
pd.notna(features.get('finish_height')):\n", + " features['finish_offset_from_typical'] = abs(features['finish_height'] - typical_finish_y)\n", + " else:\n", + " features['finish_offset_from_typical'] = np.nan\n", + " \n", + " # Hold positions relative to start\n", + " if len(start_holds) > 0:\n", + " start_y = start_holds['y'].mean()\n", + " features['mean_y_relative_to_start'] = features['mean_y'] - start_y\n", + " features['max_y_relative_to_start'] = features['max_y'] - start_y\n", + " else:\n", + " features['mean_y_relative_to_start'] = np.nan\n", + " features['max_y_relative_to_start'] = np.nan\n", + " \n", + " # Spread normalized by board\n", + " features['spread_x_normalized'] = features['range_x'] / board_width\n", + " features['spread_y_normalized'] = features['range_y'] / board_height\n", + " \n", + " # Bbox coverage (percentage of board covered)\n", + " features['bbox_coverage_x'] = features['range_x'] / board_width\n", + " features['bbox_coverage_y'] = features['range_y'] / board_height\n", + " \n", + " # Position quartile features\n", + " y_quartiles = np.percentile(ys, [25, 50, 75])\n", + " features['y_q25'] = y_quartiles[0]\n", + " features['y_q50'] = y_quartiles[1]\n", + " features['y_q75'] = y_quartiles[2]\n", + " features['y_iqr'] = y_quartiles[2] - y_quartiles[0]\n", + " \n", + " # Holds in each vertical quartile\n", + " features['holds_bottom_quartile'] = (ys < y_quartiles[0]).sum()\n", + " features['holds_top_quartile'] = (ys >= y_quartiles[2]).sum()\n", + " \n", + " return features" + ] + }, + { + "cell_type": "markdown", + "id": "e800c18b", + "metadata": {}, + "source": [ + "## Sanity Check on One Example\n", + "\n", + "Before extracting features for the entire dataset, we inspect one representative climb to confirm that the parsing logic and the computed geometric summaries behave as expected. 
Let's do the climb \"Anna Got Me Clickin\" from notebook two.\n", + "\n", + "![Anna Got Me Clickin](../images/02_hold_stats/Anna_Got_Me_Clickin.png)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "573182a3", + "metadata": {}, + "outputs": [], + "source": [ + "extract_features(df_climbs.iloc[10000], placement_coords, df_placements)" + ] + }, + { + "cell_type": "markdown", + "id": "551b47ed", + "metadata": {}, + "source": [ + "The printed example above is an important checkpoint. If the parsed placements, role counts, or geometric summaries look unreasonable here, then the full feature matrix will inherit those mistakes.\n" + ] + }, + { + "cell_type": "markdown", + "id": "6df7451e", + "metadata": {}, + "source": [ + "## Extract Features for all climbs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8ee9856b", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"\n", + "==================================\n", + "Extract features for all climbs\n", + "==================================\n", + "\"\"\"\n", + "\n", + "from tqdm import tqdm # Progress bar. 
This will take a while.\n", + "\n", + "print(f\"Extracting features for {len(df_climbs)} climbs...\")\n", + "\n", + "feature_list = []\n", + "\n", + "for idx, row in tqdm(df_climbs.iterrows(), total=len(df_climbs)):\n", + " features = extract_features(row, placement_coords, df_hold_difficulty)\n", + " if features:\n", + " features['climb_uuid'] = row['uuid']\n", + " features['display_difficulty'] = row['display_difficulty']\n", + " feature_list.append(features)\n", + "\n", + "df_features = pd.DataFrame(feature_list)\n", + "df_features = df_features.set_index('climb_uuid')\n", + "\n", + "print(f\"\\nExtracted features for {len(df_features)} climbs\")\n", + "print(f\"Feature columns: {len(df_features.columns)}\")\n", + "\n", + "print(\"\\n### Feature Table Sample\\n\")\n", + "display(df_features.head(10))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dcbb5de5", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"\n", + "==================================\n", + "Feature Summary Statistics\n", + "==================================\n", + "\"\"\"\n", + "\n", + "print(\"### Feature Summary\\n\")\n", + "\n", + "summary = df_features.describe().T\n", + "summary['missing'] = df_features.isna().sum()\n", + "summary['missing_pct'] = (df_features.isna().sum() / len(df_features) * 100).round(2)\n", + "\n", + "display(summary[['count', 'mean', 'std', 'min', 'max', 'missing', 'missing_pct']])" + ] + }, + { + "cell_type": "markdown", + "id": "bb2eb615", + "metadata": {}, + "source": [ + "## Correlation with Difficulty" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "668a506e", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"\n", + "==================================\n", + "Correlation with Difficulty\n", + "==================================\n", + "\"\"\"\n", + "\n", + "correlations = df_features.corr()['display_difficulty'].drop('display_difficulty').sort_values(key=abs, ascending=False)\n", + "\n", + "print(\"### 
Top 30 Features Correlated with Difficulty\\n\")\n", + "display(correlations.head(30).to_frame('correlation'))\n", + "\n", + "print(\"\\n### Bottom 10 Features (Least Correlated)\\n\")\n", + "display(correlations.tail(10).to_frame('correlation'))" + ] + }, + { + "cell_type": "markdown", + "id": "95ef9547", + "metadata": {}, + "source": [ + "# Visualizing Key Features" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "25a55e53", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"\n", + "==================================\n", + "Visualize Key Features\n", + "==================================\n", + "\"\"\"\n", + "\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "\n", + "fig, axes = plt.subplots(4, 4, figsize=(16, 16))\n", + "\n", + "key_features = [\n", + " 'angle',\n", + " 'total_holds',\n", + " 'height_gained',\n", + " 'mean_hold_difficulty',\n", + " 'max_hold_difficulty',\n", + " 'mean_hand_reach',\n", + " 'hold_density',\n", + " 'symmetry_score',\n", + " 'is_nomatch',\n", + " 'convex_hull_area',\n", + " 'difficulty_progression',\n", + " 'mean_y_normalized',\n", + " 'clustering_ratio',\n", + " 'path_efficiency',\n", + " 'max_difficulty_jump',\n", + " 'difficulty_weighted_reach'\n", + "]\n", + "\n", + "for ax, feature in zip(axes.flat, key_features):\n", + " if feature in df_features.columns:\n", + " ax.scatter(df_features[feature], df_features['display_difficulty'], alpha=0.3, s=10)\n", + " ax.set_xlabel(feature)\n", + " ax.set_ylabel('Difficulty')\n", + " ax.set_title(f'{feature} vs Difficulty')\n", + "\n", + "plt.tight_layout()\n", + "plt.savefig('../images/04_climb_features/feature_correlations.png', dpi=150, bbox_inches='tight')\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d27cfcf7", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"\n", + "==================================\n", + "Add Interaction Features\n", + "==================================\n", + 
"\"\"\"\n", + "\n", + "# Angle interactions\n", + "df_features['angle_x_holds'] = df_features['angle'] * df_features['total_holds']\n", + "df_features['angle_x_difficulty'] = df_features['angle'] * df_features['mean_hold_difficulty'].fillna(0)\n", + "df_features['angle_squared'] = df_features['angle'] ** 2\n", + "\n", + "# Difficulty interactions\n", + "df_features['difficulty_x_height'] = df_features['mean_hold_difficulty'].fillna(0) * df_features['height_gained']\n", + "df_features['difficulty_x_density'] = df_features['mean_hold_difficulty'].fillna(0) * df_features['hold_density']\n", + "\n", + "# Complexity features\n", + "df_features['complexity_score'] = (\n", + " df_features['total_holds'] * \n", + " df_features['mean_hand_reach'].fillna(0) * \n", + " df_features['hold_density']\n", + ")\n", + "\n", + "# Geometric × difficulty\n", + "df_features['hull_area_x_difficulty'] = df_features['convex_hull_area'].fillna(0) * df_features['mean_hold_difficulty'].fillna(0)\n", + "\n", + "print(f\"Added interaction features. 
Total columns: {len(df_features.columns)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f87892fd", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"\n", + "==================================\n", + "Handle Missing Values\n", + "==================================\n", + "\"\"\"\n", + "\n", + "missing = df_features.isna().sum()\n", + "missing_cols = missing[missing > 0]\n", + "\n", + "print(\"### Columns with Missing Values\\n\")\n", + "display(missing_cols.to_frame('missing'))\n", + "\n", + "# Fill difficulty NaNs with column mean\n", + "difficulty_cols = [c for c in df_features.columns if 'difficulty' in c.lower()]\n", + "for col in difficulty_cols:\n", + " if df_features[col].isna().any():\n", + " df_features[col] = df_features[col].fillna(df_features[col].mean())\n", + "\n", + "# Fill start/finish height with min_y/max_y if missing\n", + "df_features['start_height'] = df_features['start_height'].fillna(df_features['min_y'])\n", + "df_features['finish_height'] = df_features['finish_height'].fillna(df_features['max_y'])\n", + "\n", + "# Fill normalized features\n", + "df_features['start_height_normalized'] = df_features['start_height_normalized'].fillna(\n", + " (df_features['start_height'] - y_min) / board_height\n", + ")\n", + "df_features['finish_height_normalized'] = df_features['finish_height_normalized'].fillna(\n", + " (df_features['finish_height'] - y_min) / board_height\n", + ")\n", + "\n", + "# Fill other NaNs with column means\n", + "for col in df_features.columns:\n", + " if df_features[col].isna().any():\n", + " if df_features[col].dtype in ['float64', 'int64']:\n", + " df_features[col] = df_features[col].fillna(df_features[col].mean())\n", + "\n", + "# Check remaining missing\n", + "remaining = df_features.isna().sum().sum()\n", + "print(f\"\\nRemaining missing values: {remaining}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ed904eb3", + "metadata": {}, + "outputs": [], + "source": [ 
+ "\"\"\"\n", + "===================================\n", + "Feature Importance Review\n", + "===================================\n", + "\"\"\"\n", + "\n", + "from sklearn.ensemble import RandomForestRegressor\n", + "from sklearn.model_selection import cross_val_score\n", + "\n", + "X = df_features.drop(columns=['display_difficulty'])\n", + "y = df_features['display_difficulty']\n", + "\n", + "rf = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=3, n_jobs=-1)\n", + "rf.fit(X, y)\n", + "\n", + "importance = pd.DataFrame({\n", + " 'feature': X.columns,\n", + " 'importance': rf.feature_importances_\n", + "}).sort_values('importance', ascending=False)\n", + "\n", + "print(\"### Top 30 Most Important Features (Random Forest)\\n\")\n", + "display(importance.head(30))\n", + "\n", + "# Cross-validation score\n", + "scores = cross_val_score(rf, X, y, cv=5, scoring='neg_mean_absolute_error')\n", + "print(f\"\\nCross-validated MAE: {-scores.mean():.2f} (+/- {scores.std():.2f})\")" + ] + }, + { + "cell_type": "markdown", + "id": "547f7eb1", + "metadata": {}, + "source": [ + "# Conclusion" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7f5f95c6", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"\n", + "============================\n", + "Save Feature Matrix\n", + "============================\n", + "\"\"\"\n", + "raw_cols = [c for c in df_features.columns if c.endswith('_raw')]\n", + "if raw_cols:\n", + " print(\"Dropping raw columns from final climb feature matrix:\")\n", + " print(raw_cols)\n", + " df_features = df_features.drop(columns=raw_cols)\n", + "\n", + "# `climb_features.csv` is the canonical name used by later notebooks.\n", + "df_features.to_csv('../data/04_climb_features/climb_features.csv')\n", + "\n", + "print(\"Saved feature matrix to:\")\n", + "print(\" - ../data/04_climb_features/climb_features.csv\")\n", + "\n", + "with open('../data/04_climb_features/feature_list.txt', 'w') as f:\n", + " for col in 
df_features.columns:\n", + " f.write(f\"{col}\\n\")\n", + "\n", + "print(\"\\nFeature list saved to ../data/04_climb_features/feature_list.txt\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "07d3e1dc", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"\n", + "==================================\n", + "Final Feature Summary\n", + "==================================\n", + "\"\"\"\n", + "\n", + "print(\"### Feature Engineering Complete\\n\")\n", + "print(f\"Total climbs: {len(df_features)}\")\n", + "print(f\"Total features: {df_features.shape[1] - 1}\") # Exclude target\n", + "print(f\"Target: display_difficulty\")\n", + "print(f\"Feature matrix shape: {df_features.shape}\")\n", + "\n", + "print(\"\"\"\\nInterpretation:\n", + "- Each row is a climb-angle observation.\n", + "- The target is `display_difficulty`.\n", + "- The predictors combine geometry, hold statistics, and aggregate difficulty information.\n", + "- Hold-difficulty-based features use Bayesian-smoothed hold scores from Notebook 03.\n", + "- The next notebook tests how much predictive signal these engineered features actually contain.\n", + "\"\"\")\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.14.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/sql/01_data_exploration.sql b/sql/01_data_exploration.sql index 5d40719..e3ad94c 100644 --- a/sql/01_data_exploration.sql +++ b/sql/01_data_exploration.sql @@ -411,6 +411,21 @@ id |product_id|name |x |y|mirrored_hole_id|mirror_group| 1137| 1|31,KB1|124|4| 0| 0| * * These tell us the coordinates on the board. 
+ * Let's see what our range is for the Kilter Board Original + */ + +SELECT + MIN(x) AS x_min, + MAX(x) AS x_max, + MIN(y) AS y_min, + MAX(y) AS y_max +FROM holes WHERE product_id=1; + +/* +x_min|x_max|y_min|y_max| +-----+-----+-----+-----+ + -20| 164| 4| 176| + * * * Lastly, let's look at leds. */ @@ -780,6 +795,15 @@ AND h.id NOT IN ( /* id|product_id|name|x|y|mirrored_hole_id|mirror_group| --+----------+----+-+-+----------------+------------+ + * + * Lastly, the following is important when we wish to visualize hold patterns. We need the edges. + */ +SELECT * FROM product_sizes ps WHERE id=28; +/* +id|product_id|edge_left|edge_right|edge_bottom|edge_top|name |description|image_filename |position|is_listed| +--+----------+---------+----------+-----------+--------+-------+-----------+--------------------+--------+---------+ +28| 1| -24| 168| 0| 156|16 x 12|Super Wide |product_sizes/28.png| 5| 1| + * */ --------------------------------------------------------------- @@ -791,7 +815,3 @@ id|product_id|name|x|y|mirrored_hole_id|mirror_group| * - Hold positions are decoded via mapping placements to (x,y) coordinates (from the holes tables) * - There are four hold types: start, middle, finish, foot. 692 holds on the Original (16x12) */ - - - -