Tension Board 2 Mirror: Feature Engineering¶
The goal of this notebook is to convert raw climb descriptions into a clean modelling table. Each row of the final table corresponds to a single climb-angle observation, and each column is a numeric feature that may help predict grade.
Modelling idea¶
A climb's grade should depend on more than just angle. It should also depend on the geometry and sequencing of the holds used. To capture that, this notebook builds features from three sources:
Wall configuration
Examples: angle, board geometry, mirrored placements.
Route structure
Examples: number of holds, spatial spread, height gained, move lengths, left/right balance, and other frame-derived quantities.
Hold difficulty priors
Examples: average, maximum, and distributional summaries of the empirical hold scores built in notebook 03.
Output¶
The final product is a saved feature matrix that is reused in the predictive modelling and deep learning notebooks.
Notebook Structure¶
Setup and Imports¶
This section loads the database, auxiliary tables, and the hold-difficulty table produced in notebook 03.
"""
==================================
Setup and Imports
==================================
"""
# Imports
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import matplotlib.patches as mpatches
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from scipy.spatial import ConvexHull
from scipy.spatial.distance import pdist, squareform
import sqlite3
import re
import os
from collections import defaultdict
import ast
from PIL import Image
# Pandas display options: never truncate columns, cap printed rows at 100
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 100),
):
    pd.set_option(option_name, option_value)

# Colour palette (for multi-bar graphs)
palette = ['steelblue', 'coral', 'seagreen']

# Board image kept around for visual analysis
board_img = Image.open('../images/tb2_board_12x12_composite.png')

# Open the SQLite database
DB_PATH = "../data/tb2.db"
conn = sqlite3.connect(DB_PATH)

# Make sure the output directories for this notebook exist
for output_dir in ('../data/04_climb_features', '../images/04_climb_features'):
    os.makedirs(output_dir, exist_ok=True)
"""
==================================
Query our data from the DB
==================================
This time we restrict to where `layout_id=10` for the TB2 Mirror.
We will also restrict ourselves to an angle of at most 50, since according to our grade vs angle distribution in notebook 01, things start to look a bit weird past 50.
(Probably a bias towards climbers who can actually climb that steep). We will encode this directly into our query.
"""
# Climb query: joins each climb to its layout, product, per-angle stats row
# (climb_stats) and the grade name whose difficulty equals the rounded
# display_difficulty. Because of the climb_stats join, one climb can yield
# several rows (one per angle). Filters: listed climbs only, layout_id = 10,
# non-null display_difficulty, angle <= 50.
climbs_query = """
SELECT
c.uuid,
c.name AS climb_name,
c.setter_username,
c.layout_id AS layout_id,
c.description,
c.is_nomatch,
c.is_listed,
l.name AS layout_name,
p.name AS board_name,
c.frames,
cs.angle,
cs.display_difficulty,
dg.boulder_name AS boulder_grade,
cs.ascensionist_count,
cs.quality_average,
cs.fa_at
FROM climbs c
JOIN layouts l ON c.layout_id = l.id
JOIN products p ON l.product_id = p.id
JOIN climb_stats cs ON c.uuid = cs.climb_uuid
JOIN difficulty_grades dg ON ROUND(cs.display_difficulty) = dg.difficulty
WHERE cs.display_difficulty IS NOT NULL AND c.is_listed=1 AND c.layout_id=10 AND cs.angle <= 50
"""
# Placement query: every placement of layout 10 with the (x, y) of its hole,
# its default role and its hold set. The mirror placement is found through
# holes.mirrored_hole_id, restricted to the same layout; both mirror joins are
# LEFT JOINs, so mirror_placement_id is NULL when no mirror exists.
placements_query = """
SELECT
p.id AS placement_id,
h.x,
h.y,
p.default_placement_role_id AS default_role_id,
p.set_id AS set_id,
s.name AS set_name,
p_mirror.id AS mirror_placement_id
FROM placements p
JOIN holes h ON p.hole_id = h.id
JOIN sets s ON p.set_id = s.id
LEFT JOIN holes h_mirror ON h.mirrored_hole_id = h_mirror.id
LEFT JOIN placements p_mirror ON p_mirror.hole_id = h_mirror.id AND p_mirror.layout_id = p.layout_id
WHERE p.layout_id = 10
"""
# Run both SQL queries against the open connection
df_climbs = pd.read_sql_query(climbs_query, conn)
df_placements = pd.read_sql_query(placements_query, conn)

# Hold-level difficulty table written by notebook 03, keyed by placement_id
df_hold_difficulty = pd.read_csv(
    '../data/03_hold_difficulty/hold_difficulty_scores.csv',
    index_col='placement_id',
)

# Report which difficulty columns are available and insist on the overall one,
# since it is the last-resort fallback during feature extraction
print("Difficulty-related columns loaded from Notebook 03:")
difficulty_column_names = [
    name for name in df_hold_difficulty.columns if 'difficulty' in name.lower()
]
print(difficulty_column_names)
assert 'overall_difficulty' in df_hold_difficulty.columns, "Missing overall_difficulty"
df_hold_difficulty
# placement_id -> (x, y) lookup used by extract_features.
# Built column-wise instead of with DataFrame.iterrows(): iterrows() creates a
# Series per row (slow) and does not preserve per-column dtypes, so integer ids
# could come back as floats. tolist() yields plain Python scalars.
placement_coords = dict(
    zip(
        df_placements['placement_id'].tolist(),
        zip(df_placements['x'].tolist(), df_placements['y'].tolist()),
    )
)

# Board dimensions and coordinate extents used to normalise features.
# NOTE(review): x_max - x_min is 136 while board_width is 144 — confirm that
# the mismatch is intentional (e.g. a margin around the outermost holes).
board_width = 144
board_height = 144
x_min, x_max = -68, 68
y_min, y_max = 0, 144

# Role definitions (TB2): role name -> role id as it appears in frame strings
ROLE_DEFINITIONS = {
    'start': 5,
    'middle': 6,
    'finish': 7,
    'foot': 8
}

# Derived from ROLE_DEFINITIONS so the id lists cannot drift out of sync
HAND_ROLE_IDS = [ROLE_DEFINITIONS[role] for role in ('start', 'middle', 'finish')]
FOOT_ROLE_IDS = [ROLE_DEFINITIONS['foot']]
"""
==================================
Parse Frame function
==================================
"""
def parse_frames(frames_str):
    """
    Turn a frames string into (placement_id, role_id) pairs.

    Parameters:
    -----------
    frames_str : str
        Frame string like "p1r5p2r6p3r8"; each "p<digits>r<digits>" token
        describes one hold.

    Returns:
    --------
    list of tuples: [(placement_id, role_id), ...] in order of appearance.
    Anything that is not a string yields an empty list.
    """
    if isinstance(frames_str, str):
        return [
            (int(token.group(1)), int(token.group(2)))
            for token in re.finditer(r'p(\d+)r(\d+)', frames_str)
        ]
    return []
def get_role_type(role_id):
    """Return the role name whose id equals role_id, or 'unknown' if none does."""
    return next(
        (name for name, known_id in ROLE_DEFINITIONS.items() if known_id == role_id),
        'unknown',
    )
# Smoke test: parse a small hand-written frames string and print the result
# for visual inspection (four "p<id>r<role>" tokens, so four tuples).
test_frames = "p1r5p2r6p3r8p4r5"
parsed = parse_frames(test_frames)
print(f"Test parse: {parsed}")
Feature Extraction¶
This is the core notebook section. The aim is to translate the raw frames string into a route-level numerical representation suitable for regression or classification models.
"""
==================================
Feature Extraction Function
==================================
"""
# Matches "no match", "nomatch" and "no matching" in a free-text description.
_NO_MATCH_PATTERN = re.compile(r'\bno\s*match(ing)?\b', flags=re.IGNORECASE)


def _lookup_hold_difficulty(df_hold_difficulty, placement_id, role_type, is_hand, is_foot, angle):
    """Return the hold's difficulty score at this angle, or None if unavailable."""
    angle_tag = f"{int(angle)}deg"
    if placement_id not in df_hold_difficulty.index:
        return None
    columns = df_hold_difficulty.columns
    difficulty = None
    # Most specific first: role-specific score (start/middle/finish/foot)
    role_key = f"{role_type}_diff_{angle_tag}"
    if role_key in columns:
        difficulty = df_hold_difficulty.loc[placement_id, role_key]
    # Then the hand/foot aggregate for this angle
    if pd.isna(difficulty):
        hand_key = f"hand_diff_{angle_tag}"
        foot_key = f"foot_diff_{angle_tag}"
        if is_hand and hand_key in columns:
            difficulty = df_hold_difficulty.loc[placement_id, hand_key]
        elif is_foot and foot_key in columns:
            difficulty = df_hold_difficulty.loc[placement_id, foot_key]
    # Last resort: the angle-independent overall score
    if pd.isna(difficulty) and 'overall_difficulty' in columns:
        difficulty = df_hold_difficulty.loc[placement_id, 'overall_difficulty']
    return difficulty


def _pairwise_distances(xs, ys):
    """Return Euclidean distances for all unordered point pairs (i < j), row-major."""
    first, second = np.triu_indices(len(xs), k=1)
    dx = xs[first] - xs[second]
    dy = ys[first] - ys[second]
    return np.sqrt(dx**2 + dy**2)


def extract_features(row, placement_coords, df_hold_difficulty):
    """
    Extract all features from a single climb row.

    Parameters:
    -----------
    row : mapping (e.g. a pandas Series)
        Must provide 'frames', 'angle', 'is_nomatch' and 'description'.
    placement_coords : dict
        Maps placement_id -> (x, y). Holds whose placement is missing from
        this mapping are ignored.
    df_hold_difficulty : pandas.DataFrame
        Indexed by placement_id. Columns named "<role>_diff_<angle>deg",
        "hand_diff_<angle>deg", "foot_diff_<angle>deg" and
        "overall_difficulty" are consulted, in that order of preference.

    Returns:
    --------
    dict mapping feature name -> value, or None when the frames string yields
    no holds or none of the holds has known coordinates.

    Also reads the module-level board constants (board_width, board_height,
    x_min, x_max, y_min) and HAND_ROLE_IDS / FOOT_ROLE_IDS.
    """
    # Parse frames
    holds = parse_frames(row['frames'])
    angle = row['angle']
    if not holds:
        return None
    features = {}

    # =====================
    # BASIC HOLD EXTRACTION
    # =====================
    hold_data = []
    for placement_id, role_id in holds:
        x, y = placement_coords.get(placement_id, (None, None))
        if x is None:
            continue
        role_type = get_role_type(role_id)
        is_hand = role_id in HAND_ROLE_IDS
        is_foot = role_id in FOOT_ROLE_IDS
        hold_data.append({
            'placement_id': placement_id,
            'x': x,
            'y': y,
            'role_id': role_id,
            'role_type': role_type,
            'is_hand': is_hand,
            'is_foot': is_foot,
            'difficulty': _lookup_hold_difficulty(
                df_hold_difficulty, placement_id, role_type, is_hand, is_foot, angle
            ),
        })
    if not hold_data:
        return None
    df_holds = pd.DataFrame(hold_data)

    # Separate by role
    hand_holds = df_holds[df_holds['is_hand']]
    foot_holds = df_holds[df_holds['is_foot']]
    start_holds = df_holds[df_holds['role_type'] == 'start']
    finish_holds = df_holds[df_holds['role_type'] == 'finish']
    middle_holds = df_holds[df_holds['role_type'] == 'middle']

    # =====================
    # 1. ANGLE
    # =====================
    features['angle'] = angle

    # =====================
    # 2. BASIC COUNTS
    # =====================
    features['total_holds'] = len(df_holds)
    features['hand_holds'] = len(hand_holds)
    features['foot_holds'] = len(foot_holds)
    features['start_holds'] = len(start_holds)
    features['finish_holds'] = len(finish_holds)
    features['middle_holds'] = len(middle_holds)

    # =====================
    # 3. MATCHING FEATURE
    # =====================
    # A climb is "matching" if you are allowed to match your hands at any hold.
    # There are slight differences in difficulty between matching and
    # no-matching climbs as per our analysis in 01.
    # The description can be NULL in the database (None/NaN here), so only
    # search it when it really is a string.
    description = row['description']
    mentions_no_match = isinstance(description, str) and bool(
        _NO_MATCH_PATTERN.search(description)
    )
    features['is_nomatch'] = int((row['is_nomatch'] == 1) or mentions_no_match)

    # =====================
    # 4. SPATIAL/POSITION
    # =====================
    xs = df_holds['x'].values
    ys = df_holds['y'].values
    features['mean_x'] = np.mean(xs)
    features['mean_y'] = np.mean(ys)
    features['std_x'] = np.std(xs) if len(xs) > 1 else 0
    features['std_y'] = np.std(ys) if len(ys) > 1 else 0
    features['range_x'] = np.max(xs) - np.min(xs)
    features['range_y'] = np.max(ys) - np.min(ys)
    features['min_y'] = np.min(ys)
    features['max_y'] = np.max(ys)

    # =====================
    # 5. HEIGHT FEATURES
    # =====================
    if len(start_holds) > 0:
        features['start_height'] = start_holds['y'].mean()
        features['start_height_min'] = start_holds['y'].min()
        features['start_height_max'] = start_holds['y'].max()
    else:
        features['start_height'] = np.nan
        features['start_height_min'] = np.nan
        features['start_height_max'] = np.nan
    if len(finish_holds) > 0:
        features['finish_height'] = finish_holds['y'].mean()
        features['finish_height_min'] = finish_holds['y'].min()
        features['finish_height_max'] = finish_holds['y'].max()
    else:
        features['finish_height'] = np.nan
        features['finish_height_min'] = np.nan
        features['finish_height_max'] = np.nan
    features['height_gained'] = features['max_y'] - features['min_y']
    if pd.notna(features['finish_height']) and pd.notna(features['start_height']):
        features['height_gained_start_finish'] = features['finish_height'] - features['start_height']
    else:
        features['height_gained_start_finish'] = np.nan

    # =====================
    # 6. BBOX FEATURES
    # =====================
    bbox_width = features['range_x']
    bbox_height = features['range_y']
    features['bbox_area'] = bbox_width * bbox_height
    features['bbox_aspect_ratio'] = bbox_width / bbox_height if bbox_height > 0 else 0
    features['bbox_normalized_area'] = features['bbox_area'] / (board_width * board_height)

    # =====================
    # 7. HOLD DENSITY
    # =====================
    if features['bbox_area'] > 0:
        features['hold_density'] = features['total_holds'] / features['bbox_area']
    else:
        features['hold_density'] = 0
    # Denominator is clamped to 1 to avoid dividing by zero on flat climbs
    features['holds_per_vertical_foot'] = features['total_holds'] / max(features['range_y'], 1)

    # =====================
    # 8. SYMMETRY/BALANCE
    # =====================
    center_x = (x_min + x_max) / 2
    features['left_holds'] = (df_holds['x'] < center_x).sum()
    features['right_holds'] = (df_holds['x'] >= center_x).sum()
    features['left_ratio'] = features['left_holds'] / features['total_holds'] if features['total_holds'] > 0 else 0.5
    # Symmetry score (how balanced left/right): 1 = even split, 0 = one-sided
    features['symmetry_score'] = 1 - abs(features['left_ratio'] - 0.5) * 2
    # Hand symmetry
    if len(hand_holds) > 0:
        hand_left = (hand_holds['x'] < center_x).sum()
        features['hand_left_ratio'] = hand_left / len(hand_holds)
        features['hand_symmetry'] = 1 - abs(features['hand_left_ratio'] - 0.5) * 2
    else:
        features['hand_left_ratio'] = np.nan
        features['hand_symmetry'] = np.nan

    # =====================
    # 9. VERTICAL DISTRIBUTION
    # =====================
    y_median = np.median(ys)
    features['upper_holds'] = (df_holds['y'] > y_median).sum()
    features['lower_holds'] = (df_holds['y'] <= y_median).sum()
    features['upper_ratio'] = features['upper_holds'] / features['total_holds']

    # =====================
    # 10. HAND REACH / SPREAD
    # =====================
    if len(hand_holds) >= 2:
        hand_xs = hand_holds['x'].values
        hand_ys = hand_holds['y'].values
        hand_distances = _pairwise_distances(hand_xs, hand_ys)
        features['max_hand_reach'] = hand_distances.max()
        features['min_hand_reach'] = hand_distances.min()
        features['mean_hand_reach'] = np.mean(hand_distances)
        features['std_hand_reach'] = np.std(hand_distances)
        features['hand_spread_x'] = hand_xs.max() - hand_xs.min()
        features['hand_spread_y'] = hand_ys.max() - hand_ys.min()
    else:
        features['max_hand_reach'] = 0
        features['min_hand_reach'] = 0
        features['mean_hand_reach'] = 0
        features['std_hand_reach'] = 0
        features['hand_spread_x'] = 0
        features['hand_spread_y'] = 0

    # =====================
    # 11. FOOT SPREAD
    # =====================
    if len(foot_holds) >= 2:
        foot_xs = foot_holds['x'].values
        foot_ys = foot_holds['y'].values
        foot_distances = _pairwise_distances(foot_xs, foot_ys)
        features['max_foot_spread'] = foot_distances.max()
        features['mean_foot_spread'] = np.mean(foot_distances)
        features['foot_spread_x'] = foot_xs.max() - foot_xs.min()
        features['foot_spread_y'] = foot_ys.max() - foot_ys.min()
    else:
        features['max_foot_spread'] = 0
        features['mean_foot_spread'] = 0
        features['foot_spread_x'] = 0
        features['foot_spread_y'] = 0

    # =====================
    # 12. HAND-TO-FOOT DISTANCES
    # =====================
    if len(hand_holds) > 0 and len(foot_holds) > 0:
        # Broadcast every hand hold against every foot hold (hands x feet grid)
        dx = hand_holds['x'].values[:, None] - foot_holds['x'].values[None, :]
        dy = hand_holds['y'].values[:, None] - foot_holds['y'].values[None, :]
        h2f_distances = np.sqrt(dx**2 + dy**2).ravel()
        features['max_hand_to_foot'] = h2f_distances.max()
        features['min_hand_to_foot'] = h2f_distances.min()
        features['mean_hand_to_foot'] = np.mean(h2f_distances)
        features['std_hand_to_foot'] = np.std(h2f_distances)
    else:
        features['max_hand_to_foot'] = 0
        features['min_hand_to_foot'] = 0
        features['mean_hand_to_foot'] = 0
        features['std_hand_to_foot'] = 0

    # =====================
    # 13. HOLD DIFFICULTY FEATURES
    # =====================
    # has_score marks the holds that actually have a difficulty value; it keeps
    # positional data (ys) aligned with the NaN-dropped difficulty array.
    has_score = df_holds['difficulty'].notna().values
    difficulties = df_holds['difficulty'].dropna().values
    if len(difficulties) > 0:
        features['mean_hold_difficulty'] = np.mean(difficulties)
        features['max_hold_difficulty'] = np.max(difficulties)
        features['min_hold_difficulty'] = np.min(difficulties)
        features['std_hold_difficulty'] = np.std(difficulties)
        features['median_hold_difficulty'] = np.median(difficulties)
        features['difficulty_range'] = features['max_hold_difficulty'] - features['min_hold_difficulty']
    else:
        features['mean_hold_difficulty'] = np.nan
        features['max_hold_difficulty'] = np.nan
        features['min_hold_difficulty'] = np.nan
        features['std_hold_difficulty'] = np.nan
        features['median_hold_difficulty'] = np.nan
        features['difficulty_range'] = np.nan
    # Hand difficulty
    hand_diffs = hand_holds['difficulty'].dropna().values if len(hand_holds) > 0 else np.array([])
    if len(hand_diffs) > 0:
        features['mean_hand_difficulty'] = np.mean(hand_diffs)
        features['max_hand_difficulty'] = np.max(hand_diffs)
        features['std_hand_difficulty'] = np.std(hand_diffs)
    else:
        features['mean_hand_difficulty'] = np.nan
        features['max_hand_difficulty'] = np.nan
        features['std_hand_difficulty'] = np.nan
    # Foot difficulty
    foot_diffs = foot_holds['difficulty'].dropna().values if len(foot_holds) > 0 else np.array([])
    if len(foot_diffs) > 0:
        features['mean_foot_difficulty'] = np.mean(foot_diffs)
        features['max_foot_difficulty'] = np.max(foot_diffs)
        features['std_foot_difficulty'] = np.std(foot_diffs)
    else:
        features['mean_foot_difficulty'] = np.nan
        features['max_foot_difficulty'] = np.nan
        features['std_foot_difficulty'] = np.nan
    # Start/Finish difficulty
    start_diffs = start_holds['difficulty'].dropna().values if len(start_holds) > 0 else np.array([])
    finish_diffs = finish_holds['difficulty'].dropna().values if len(finish_holds) > 0 else np.array([])
    features['start_difficulty'] = np.mean(start_diffs) if len(start_diffs) > 0 else np.nan
    features['finish_difficulty'] = np.mean(finish_diffs) if len(finish_diffs) > 0 else np.nan

    # =====================
    # 14. ADDITIONAL BASIC FEATURES
    # =====================
    # Hand to foot ratio
    features['hand_foot_ratio'] = features['hand_holds'] / max(features['foot_holds'], 1)
    # Movement complexity
    features['movement_density'] = features['total_holds'] / max(features['height_gained'], 1)
    # Center of mass of holds (unweighted, so identical to the plain means)
    features['hold_com_x'] = np.mean(xs)
    features['hold_com_y'] = np.mean(ys)
    # Weighted difficulty (by y position): weights run from 0.5 at the lowest
    # hold to 1.5 at the highest. Only holds with a score contribute, and each
    # score is paired with the height of its own hold.
    if len(difficulties) > 0:
        weights = (ys[has_score] - ys.min()) / max(ys.max() - ys.min(), 1) + 0.5
        features['weighted_difficulty'] = np.average(difficulties, weights=weights)
    else:
        features['weighted_difficulty'] = features['mean_hold_difficulty']

    # =====================================================
    # 15. GEOMETRIC FEATURES
    # =====================================================
    # Convex hull area (2D polygon enclosing all holds)
    if len(df_holds) >= 3:
        try:
            hull = ConvexHull(np.column_stack([xs, ys]))
            features['convex_hull_area'] = hull.volume  # In 2D, volume = area
            features['convex_hull_perimeter'] = hull.area  # In 2D, area = perimeter
            features['hull_area_to_bbox_ratio'] = features['convex_hull_area'] / max(features['bbox_area'], 1)
        except Exception:
            # Degenerate input (e.g. all holds collinear) makes Qhull fail;
            # record the hull as unknown instead of aborting the extraction.
            features['convex_hull_area'] = np.nan
            features['convex_hull_perimeter'] = np.nan
            features['hull_area_to_bbox_ratio'] = np.nan
    else:
        features['convex_hull_area'] = 0
        features['convex_hull_perimeter'] = 0
        features['hull_area_to_bbox_ratio'] = 0
    # Pairwise hold distances. NOTE: despite the "nn" in the feature names,
    # these summarise ALL pairwise distances, not only each hold's nearest
    # neighbour; the names are kept because later notebooks rely on them.
    if len(df_holds) >= 2:
        distances = pdist(np.column_stack([xs, ys]))
        features['min_nn_distance'] = np.min(distances)
        features['mean_nn_distance'] = np.mean(distances)
        features['max_nn_distance'] = np.max(distances)
        features['std_nn_distance'] = np.std(distances)
    else:
        features['min_nn_distance'] = 0
        features['mean_nn_distance'] = 0
        features['max_nn_distance'] = 0
        features['std_nn_distance'] = 0
    # Clustering coefficient (holds grouped vs spread)
    if len(df_holds) >= 3:
        dist_matrix = squareform(pdist(np.column_stack([xs, ys])))
        # Count neighbors within threshold (e.g., 12 inches)
        threshold = 12.0
        neighbors_count = (dist_matrix < threshold).sum(axis=1) - 1  # Exclude self
        features['mean_neighbors_12in'] = np.mean(neighbors_count)
        features['max_neighbors_12in'] = np.max(neighbors_count)
        # Clustering: ratio of actual neighbors to max possible
        avg_neighbors = np.mean(neighbors_count)
        max_possible = len(df_holds) - 1
        features['clustering_ratio'] = avg_neighbors / max_possible if max_possible > 0 else 0
    else:
        features['mean_neighbors_12in'] = 0
        features['max_neighbors_12in'] = 0
        features['clustering_ratio'] = 0
    # Path length: visit the holds in order of increasing height and add up
    # the straight-line distance between consecutive holds.
    if len(df_holds) >= 2:
        sorted_indices = np.argsort(ys)
        sorted_points = np.column_stack([xs[sorted_indices], ys[sorted_indices]])
        path_length = 0
        for (x0, y0), (x1, y1) in zip(sorted_points[:-1], sorted_points[1:]):
            path_length += np.sqrt((x1 - x0)**2 + (y1 - y0)**2)
        features['path_length_vertical'] = path_length
        features['path_efficiency'] = features['height_gained'] / max(path_length, 1)
    else:
        features['path_length_vertical'] = 0
        features['path_efficiency'] = 0

    # =====================================================
    # 16. DIFFICULTY-WEIGHTED FEATURES
    # =====================================================
    # Difficulty gradient (finish vs start)
    if pd.notna(features['finish_difficulty']) and pd.notna(features['start_difficulty']):
        features['difficulty_gradient'] = features['finish_difficulty'] - features['start_difficulty']
    else:
        features['difficulty_gradient'] = np.nan
    # Difficulty by vertical region (split into thirds)
    if len(difficulties) > 0:
        y_min_val, y_max_val = ys.min(), ys.max()
        y_range = y_max_val - y_min_val
        if y_range > 0:
            lower_cut = y_min_val + y_range / 3
            upper_cut = y_min_val + 2 * y_range / 3
            region_masks = (
                ('lower', ys <= lower_cut),
                ('middle', (ys > lower_cut) & (ys <= upper_cut)),
                ('upper', ys > upper_cut),
            )
            for region, region_mask in region_masks:
                region_diffs = df_holds.loc[region_mask & has_score, 'difficulty']
                features[f'{region}_region_difficulty'] = region_diffs.mean() if len(region_diffs) > 0 else np.nan
            # Difficulty progression (upper - lower)
            if pd.notna(features['lower_region_difficulty']) and pd.notna(features['upper_region_difficulty']):
                features['difficulty_progression'] = features['upper_region_difficulty'] - features['lower_region_difficulty']
            else:
                features['difficulty_progression'] = np.nan
        else:
            # All holds at the same height: no regions to distinguish
            features['lower_region_difficulty'] = features['mean_hold_difficulty']
            features['middle_region_difficulty'] = features['mean_hold_difficulty']
            features['upper_region_difficulty'] = features['mean_hold_difficulty']
            features['difficulty_progression'] = 0
    else:
        features['lower_region_difficulty'] = np.nan
        features['middle_region_difficulty'] = np.nan
        features['upper_region_difficulty'] = np.nan
        features['difficulty_progression'] = np.nan
    # Hardest move estimate and difficulty-weighted reach. Both walk the hand
    # holds in order of increasing height and need at least two scored hands.
    if len(hand_diffs) >= 2:
        hand_sorted = hand_holds.sort_values('y')
        # Max/mean difficulty jump between consecutive scored hand holds
        difficulty_jumps = np.abs(np.diff(hand_sorted['difficulty'].dropna().values))
        features['max_difficulty_jump'] = np.max(difficulty_jumps)
        features['mean_difficulty_jump'] = np.mean(difficulty_jumps)
        # Move length times the average difficulty of the two holds involved;
        # unscored hands are given the mean hand difficulty.
        step_lengths = np.sqrt(
            np.diff(hand_sorted['x'].values)**2 + np.diff(hand_sorted['y'].values)**2
        )
        diffs_sorted = np.asarray(
            hand_sorted['difficulty'].fillna(hand_diffs.mean()).values, dtype=float
        )
        weighted_reach = step_lengths * (diffs_sorted[:-1] + diffs_sorted[1:]) / 2
        features['difficulty_weighted_reach'] = np.mean(weighted_reach)
        features['max_weighted_reach'] = np.max(weighted_reach)
    else:
        features['max_difficulty_jump'] = 0
        features['mean_difficulty_jump'] = 0
        features['difficulty_weighted_reach'] = 0
        features['max_weighted_reach'] = 0

    # =====================================================
    # 17. POSITION-NORMALIZED FEATURES
    # =====================================================
    # Normalized positions (board-relative scale)
    features['mean_x_normalized'] = (features['mean_x'] - x_min) / board_width
    features['mean_y_normalized'] = (features['mean_y'] - y_min) / board_height
    features['std_x_normalized'] = features['std_x'] / board_width
    features['std_y_normalized'] = features['std_y'] / board_height
    # Start/finish normalized
    if pd.notna(features['start_height']):
        features['start_height_normalized'] = (features['start_height'] - y_min) / board_height
    else:
        features['start_height_normalized'] = np.nan
    if pd.notna(features['finish_height']):
        features['finish_height_normalized'] = (features['finish_height'] - y_min) / board_height
    else:
        features['finish_height_normalized'] = np.nan
    # Distance from typical heights (15% of the board for starts, 85% for finishes)
    typical_start_y = y_min + board_height * 0.15
    typical_finish_y = y_min + board_height * 0.85
    if pd.notna(features['start_height']):
        features['start_offset_from_typical'] = abs(features['start_height'] - typical_start_y)
    else:
        features['start_offset_from_typical'] = np.nan
    if pd.notna(features['finish_height']):
        features['finish_offset_from_typical'] = abs(features['finish_height'] - typical_finish_y)
    else:
        features['finish_offset_from_typical'] = np.nan
    # Hold positions relative to start
    if len(start_holds) > 0:
        start_y = start_holds['y'].mean()
        features['mean_y_relative_to_start'] = features['mean_y'] - start_y
        features['max_y_relative_to_start'] = features['max_y'] - start_y
    else:
        features['mean_y_relative_to_start'] = np.nan
        features['max_y_relative_to_start'] = np.nan
    # Spread normalized by board
    features['spread_x_normalized'] = features['range_x'] / board_width
    features['spread_y_normalized'] = features['range_y'] / board_height
    # Bbox coverage (fraction of board covered; same values as the spread pair)
    features['bbox_coverage_x'] = features['range_x'] / board_width
    features['bbox_coverage_y'] = features['range_y'] / board_height
    # Position quartile features
    y_q25, y_q50, y_q75 = np.percentile(ys, [25, 50, 75])
    features['y_q25'] = y_q25
    features['y_q50'] = y_q50
    features['y_q75'] = y_q75
    features['y_iqr'] = y_q75 - y_q25
    # Holds strictly below the first quartile / at or above the third quartile
    features['holds_bottom_quartile'] = (ys < y_q25).sum()
    features['holds_top_quartile'] = (ys >= y_q75).sum()
    return features
Sanity Check on One Example¶
Before extracting features for the entire dataset, we inspect one representative climb to confirm that the parsing logic and the computed geometric summaries behave as expected. Let's do the climb "Ooo La La" from notebook two.

# Sanity-check a single climb. The third argument must be the hold-difficulty
# table: passing df_placements (which has no difficulty columns) would make
# every difficulty-based feature come back as NaN.
extract_features(df_climbs.iloc[10000], placement_coords, df_hold_difficulty)
The printed example above is an important checkpoint. If the parsed placements, role counts, or geometric summaries look unreasonable here, then the full feature matrix will inherit those mistakes.
Extract Features for All Climbs¶
"""
==================================
Extract features for all climbs
==================================
"""
from tqdm import tqdm  # Progress bar. This will take a while.

print(f"Extracting features for {len(df_climbs)} climbs...")

# One feature dict per climb-angle row; rows for which extraction yields
# nothing (None or an empty dict) are skipped.
feature_list = []
for _, climb_row in tqdm(df_climbs.iterrows(), total=len(df_climbs)):
    extracted = extract_features(climb_row, placement_coords, df_hold_difficulty)
    if not extracted:
        continue
    extracted['climb_uuid'] = climb_row['uuid']
    extracted['display_difficulty'] = climb_row['display_difficulty']
    feature_list.append(extracted)

# Assemble the table and key it by climb uuid
df_features = pd.DataFrame(feature_list).set_index('climb_uuid')

print(f"\nExtracted features for {len(df_features)} climbs")
print(f"Feature columns: {len(df_features.columns)}")
print("\n### Feature Table Sample\n")
display(df_features.head(10))
"""
==================================
Feature Summary Statistics
==================================
"""
print("### Feature Summary\n")
# describe() per feature (one row each), extended with missing-value counts
missing_counts = df_features.isna().sum()
summary = df_features.describe().T
summary['missing'] = missing_counts
summary['missing_pct'] = (missing_counts / len(df_features) * 100).round(2)
summary_columns = ['count', 'mean', 'std', 'min', 'max', 'missing', 'missing_pct']
display(summary[summary_columns])
Correlation with Difficulty¶
"""
==================================
Correlation with Difficulty
==================================
"""
# Correlation of every feature with the target, strongest (by magnitude) first
target_correlations = df_features.corr()['display_difficulty']
correlations = (
    target_correlations
    .drop('display_difficulty')
    .sort_values(key=abs, ascending=False)
)

print("### Top 30 Features Correlated with Difficulty\n")
display(correlations.head(30).to_frame('correlation'))

print("\n### Bottom 10 Features (Least Correlated)\n")
display(correlations.tail(10).to_frame('correlation'))
Visualizing Key Features¶
"""
==================================
Visualize Key Features
==================================
"""
import matplotlib.pyplot as plt
import seaborn as sns
# Sixteen features, one scatter panel each on a 4x4 grid
key_features = [
    'angle',
    'total_holds',
    'height_gained',
    'mean_hold_difficulty',
    'max_hold_difficulty',
    'mean_hand_reach',
    'hold_density',
    'symmetry_score',
    'is_nomatch',
    'convex_hull_area',
    'difficulty_progression',
    'mean_y_normalized',
    'clustering_ratio',
    'path_efficiency',
    'max_difficulty_jump',
    'difficulty_weighted_reach'
]
fig, axes = plt.subplots(4, 4, figsize=(16, 16))

for panel, feature_name in zip(axes.flat, key_features):
    # Features absent from the table leave their panel empty
    if feature_name not in df_features.columns:
        continue
    panel.scatter(
        df_features[feature_name],
        df_features['display_difficulty'],
        alpha=0.3,
        s=10,
    )
    panel.set_xlabel(feature_name)
    panel.set_ylabel('Difficulty')
    panel.set_title(f'{feature_name} vs Difficulty')

plt.tight_layout()
plt.savefig('../images/04_climb_features/feature_correlations.png', dpi=150, bbox_inches='tight')
plt.show()
"""
==================================
Add Interaction Features
==================================
"""
# Missing values are treated as 0 wherever a column enters a product below
mean_difficulty_filled = df_features['mean_hold_difficulty'].fillna(0)
hull_area_filled = df_features['convex_hull_area'].fillna(0)
hand_reach_filled = df_features['mean_hand_reach'].fillna(0)

# Angle interactions
df_features['angle_x_holds'] = df_features['angle'] * df_features['total_holds']
df_features['angle_x_difficulty'] = df_features['angle'] * mean_difficulty_filled
df_features['angle_squared'] = df_features['angle'] ** 2

# Difficulty interactions
df_features['difficulty_x_height'] = mean_difficulty_filled * df_features['height_gained']
df_features['difficulty_x_density'] = mean_difficulty_filled * df_features['hold_density']

# Complexity features: hold count x mean hand reach x hold density
df_features['complexity_score'] = (
    df_features['total_holds'] * hand_reach_filled * df_features['hold_density']
)

# Geometric × difficulty
df_features['hull_area_x_difficulty'] = hull_area_filled * mean_difficulty_filled

print(f"Added interaction features. Total columns: {len(df_features.columns)}")
"""
==================================
Handle Missing Values
==================================
"""
missing = df_features.isna().sum()
missing_cols = missing[missing > 0]
print("### Columns with Missing Values\n")
display(missing_cols.to_frame('missing'))

# The target must never be imputed: its name contains "difficulty" and it is a
# float column, so it would otherwise be caught by both fill loops below and a
# missing label would silently become the mean grade.
TARGET_COL = 'display_difficulty'
feature_cols = [c for c in df_features.columns if c != TARGET_COL]

# Fill difficulty NaNs with column mean
difficulty_cols = [c for c in feature_cols if 'difficulty' in c.lower()]
for col in difficulty_cols:
    if df_features[col].isna().any():
        df_features[col] = df_features[col].fillna(df_features[col].mean())

# Fill start/finish height with min_y/max_y if missing
df_features['start_height'] = df_features['start_height'].fillna(df_features['min_y'])
df_features['finish_height'] = df_features['finish_height'].fillna(df_features['max_y'])

# Fill normalized features from the (now filled) start/finish heights
df_features['start_height_normalized'] = df_features['start_height_normalized'].fillna(
    (df_features['start_height'] - y_min) / board_height
)
df_features['finish_height_normalized'] = df_features['finish_height_normalized'].fillna(
    (df_features['finish_height'] - y_min) / board_height
)

# Fill other NaNs with column means (any numeric dtype, not just float64/int64)
for col in feature_cols:
    if df_features[col].isna().any() and pd.api.types.is_numeric_dtype(df_features[col]):
        df_features[col] = df_features[col].fillna(df_features[col].mean())

# Rows without a label cannot be used for modelling; drop them rather than
# inventing a grade.
missing_target = int(df_features[TARGET_COL].isna().sum())
if missing_target:
    print(f"Dropping {missing_target} rows with missing {TARGET_COL}")
    df_features = df_features.dropna(subset=[TARGET_COL])

# Check remaining missing
remaining = df_features.isna().sum().sum()
print(f"\nRemaining missing values: {remaining}")
"""
===================================
Feature Importance Review
===================================
"""
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
# Target and predictors (everything except the target column)
y = df_features['display_difficulty']
X = df_features.drop(columns=['display_difficulty'])

# Fit a depth-limited random forest on the full table to rank the features
rf = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    random_state=3,
    n_jobs=-1,
)
rf.fit(X, y)

importance = pd.DataFrame(
    {'feature': X.columns, 'importance': rf.feature_importances_}
).sort_values(by='importance', ascending=False)

print("### Top 30 Most Important Features (Random Forest)\n")
display(importance.head(30))

# Cross-validation score (scorer is negated MAE, hence the sign flip on print)
scores = cross_val_score(rf, X, y, cv=5, scoring='neg_mean_absolute_error')
print(f"\nCross-validated MAE: {-scores.mean():.2f} (+/- {scores.std():.2f})")
Conclusion¶
"""
============================
Save Feature Matrix
============================
"""
# Columns ending in "_raw" are not part of the final feature matrix
raw_cols = [name for name in df_features.columns if name.endswith('_raw')]
if raw_cols:
    print("Dropping raw columns from final climb feature matrix:")
    print(raw_cols)
    df_features = df_features.drop(columns=raw_cols)

# `climb_features.csv` is the canonical name used by later notebooks.
df_features.to_csv('../data/04_climb_features/climb_features.csv')
print("Saved feature matrix to:")
print(" - ../data/04_climb_features/climb_features.csv")

# Companion text file: one column name per line
with open('../data/04_climb_features/feature_list.txt', 'w') as f:
    f.writelines(f"{col}\n" for col in df_features.columns)
print("\nFeature list saved to ../data/04_climb_features/feature_list.txt")
"""
==================================
Final Feature Summary
==================================
"""
n_climbs, n_columns = len(df_features), df_features.shape[1]

print("### Feature Engineering Complete\n")
print(f"Total climbs: {n_climbs}")
print(f"Total features: {n_columns - 1}")  # Exclude target
print("Target: display_difficulty")
print(f"Feature matrix shape: {df_features.shape}")

# Closing notes on how to read the saved table
interpretation = """\nInterpretation:
- Each row is a climb-angle observation.
- The target is `display_difficulty`.
- The predictors combine geometry, hold statistics, and aggregate difficulty information.
- Hold-difficulty-based features use Bayesian-smoothed hold scores from Notebook 03.
- The next notebook tests how much predictive signal these engineered features actually contain.
"""
print(interpretation)