"""
==================================
Setup and Imports
==================================
"""

# Imports
import ast
import os
import re
import sqlite3
from collections import defaultdict

import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from PIL import Image
from scipy.spatial import ConvexHull, QhullError
from scipy.spatial.distance import pdist, squareform
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

# Notebook display preferences: show every column, cap printed rows at 100
pd.options.display.max_columns = None
pd.options.display.max_rows = 100

# Shared colour palette (for multi-bar graphs)
palette = ['steelblue', 'coral', 'seagreen']

# Composite board image used as a backdrop for some visual analysis
board_img = Image.open('../images/tb2_board_12x12_composite.png')

# Open the SQLite database holding the board data
DB_PATH = "../data/tb2.db"
conn = sqlite3.connect(DB_PATH)

# Make sure this notebook's output directories exist
for output_dir in ('../data/04_climb_features', '../images/04_climb_features'):
    os.makedirs(output_dir, exist_ok=True)

"""
==================================
Query our data from the DB
==================================

This time we restrict to `layout_id=10` (the TB2 Mirror).
We also restrict ourselves to angles of at most 50 degrees: according to the grade-vs-angle distribution in notebook 01, things start to look irregular past 50,
probably because of a selection bias towards climbers who can actually climb that steep. Both restrictions are encoded directly in the query.
"""

# Query climbs data.
# Each climb is joined to its layout, the layout's product, its climb_stats
# rows (which carry the angle, difficulty and ascent statistics) and the
# grade label whose `difficulty` equals the rounded display_difficulty.
# Filters: non-null difficulty, listed climbs only, layout 10, angle <= 50.
# NOTE(review): presumably climb_stats holds one row per (climb, angle), so a
# climb can appear several times in the result - confirm against the schema.
climbs_query = """
SELECT
    c.uuid,
    c.name AS climb_name,
    c.setter_username,
    c.layout_id AS layout_id,
    c.description,
    c.is_nomatch,
    c.is_listed,
    l.name AS layout_name,
    p.name AS board_name,
    c.frames,
    cs.angle,
    cs.display_difficulty,
    dg.boulder_name AS boulder_grade,
    cs.ascensionist_count,
    cs.quality_average,
    cs.fa_at
FROM climbs c
JOIN layouts l ON c.layout_id = l.id
JOIN products p ON l.product_id = p.id
JOIN climb_stats cs ON c.uuid = cs.climb_uuid
JOIN difficulty_grades dg ON ROUND(cs.display_difficulty) = dg.difficulty
WHERE cs.display_difficulty IS NOT NULL AND c.is_listed=1 AND c.layout_id=10 AND cs.angle <= 50
"""

# Query information about placements (and their mirrors).
# For every placement in layout 10 this returns the hole coordinates, default
# role and hold set. The two LEFT JOINs look up the placement in the same
# layout that sits on the hole's mirrored hole; mirror_placement_id is NULL
# when no such placement exists.
placements_query = """
SELECT
    p.id AS placement_id,
    h.x,
    h.y,
    p.default_placement_role_id AS default_role_id,
    p.set_id AS set_id,
    s.name AS set_name,
    p_mirror.id AS mirror_placement_id
FROM placements p
JOIN holes h ON p.hole_id = h.id
JOIN sets s ON p.set_id = s.id
LEFT JOIN holes h_mirror ON h.mirrored_hole_id = h_mirror.id
LEFT JOIN placements p_mirror ON p_mirror.hole_id = h_mirror.id AND p_mirror.layout_id = p.layout_id
WHERE p.layout_id = 10
"""

# Load both result sets into DataFrames using the connection opened above
df_climbs = pd.read_sql_query(climbs_query, conn)
df_placements = pd.read_sql_query(placements_query, conn)

# Hold-level difficulty table written by notebook 03, indexed by placement_id
df_hold_difficulty = pd.read_csv(
    '../data/03_hold_difficulty/hold_difficulty_scores.csv',
    index_col='placement_id',
)

print("Difficulty-related columns loaded from Notebook 03:")
print(list(filter(lambda name: 'difficulty' in name.lower(), df_hold_difficulty.columns)))

# The overall score is the last-resort fallback during feature extraction
assert 'overall_difficulty' in df_hold_difficulty.columns, "Missing overall_difficulty"

df_hold_difficulty

# placement_id -> (x, y) lookup used when decoding climb frames
placement_coords = {
    placement_id: (x, y)
    for placement_id, x, y in zip(
        df_placements['placement_id'], df_placements['x'], df_placements['y']
    )
}

# Board dimensions and coordinate extents used for normalisation
board_width = board_height = 144

x_min, x_max = -68, 68
y_min, y_max = 0, 144

# Role definitions (TB2): role name -> role_id as it appears in frame strings
ROLE_DEFINITIONS = dict(start=5, middle=6, finish=7, foot=8)

# Start, middle and finish holds are hand holds; only role 8 is a foot hold
HAND_ROLE_IDS = [ROLE_DEFINITIONS[name] for name in ('start', 'middle', 'finish')]
FOOT_ROLE_IDS = [ROLE_DEFINITIONS['foot']]

"""
==================================
Parse Frame function
==================================
"""

def parse_frames(frames_str):
    """
    Decode a frames string into (placement_id, role_id) pairs.

    Parameters:
    -----------
    frames_str : str
        Encoded hold list such as "p1r5p2r6p3r8": each hold is written as
        ``p<placement_id>r<role_id>``.

    Returns:
    --------
    list of tuples: [(placement_id, role_id), ...] in order of appearance.
    Any non-string input (e.g. None or NaN) yields an empty list.
    """
    if isinstance(frames_str, str):
        return [
            (int(hit.group(1)), int(hit.group(2)))
            for hit in re.finditer(r'p(\d+)r(\d+)', frames_str)
        ]
    return []


def get_role_type(role_id):
    """Return the role name registered for role_id in ROLE_DEFINITIONS, or 'unknown'."""
    return next(
        (name for name, known_id in ROLE_DEFINITIONS.items() if known_id == role_id),
        'unknown',
    )


# Test: quick sanity check of parse_frames on a hand-written frames string.
# Expected output: [(1, 5), (2, 6), (3, 8), (4, 5)]
test_frames = "p1r5p2r6p3r8p4r5"
parsed = parse_frames(test_frames)
print(f"Test parse: {parsed}")

"""
==================================
Feature Extraction Function
==================================
"""

# Matches "no match", "nomatch" and "no matching" (any case) in a description.
_NOMATCH_PATTERN = re.compile(r'\bno\s*match(ing)?\b', flags=re.IGNORECASE)


def _pairwise_distances(xs, ys):
    """Return Euclidean distances for every unordered point pair (i < j), row by row."""
    i, j = np.triu_indices(len(xs), k=1)
    return np.sqrt((xs[i] - xs[j]) ** 2 + (ys[i] - ys[j]) ** 2)


def _lookup_hold_difficulty(df_hold_difficulty, placement_id, role_type, is_hand, is_foot, angle):
    """Return the most specific difficulty score available for one hold, or None.

    Lookup order: role-specific column for this angle, then the hand/foot
    aggregate column for this angle, then 'overall_difficulty'.
    """
    if placement_id not in df_hold_difficulty.index:
        return None

    columns = df_hold_difficulty.columns
    angle_tag = f"{int(angle)}deg"
    diff_key = f"{role_type}_diff_{angle_tag}"
    hand_diff_key = f"hand_diff_{angle_tag}"
    foot_diff_key = f"foot_diff_{angle_tag}"

    difficulty = None
    if diff_key in columns:
        difficulty = df_hold_difficulty.loc[placement_id, diff_key]
    if pd.isna(difficulty):
        if is_hand and hand_diff_key in columns:
            difficulty = df_hold_difficulty.loc[placement_id, hand_diff_key]
        elif is_foot and foot_diff_key in columns:
            difficulty = df_hold_difficulty.loc[placement_id, foot_diff_key]
    if pd.isna(difficulty) and 'overall_difficulty' in columns:
        difficulty = df_hold_difficulty.loc[placement_id, 'overall_difficulty']
    return difficulty


def extract_features(row, placement_coords, df_hold_difficulty):
    """
    Extract all features from a single climb row.

    Parameters:
    -----------
    row : pd.Series (or mapping)
        Must provide 'frames', 'angle', 'is_nomatch' and 'description'.
    placement_coords : dict
        Maps placement_id -> (x, y). Holds whose placement is not in this
        dict are ignored.
    df_hold_difficulty : pd.DataFrame
        Indexed by placement_id. Columns named '<role>_diff_<angle>deg',
        'hand_diff_<angle>deg', 'foot_diff_<angle>deg' and
        'overall_difficulty' are consulted, in that order of preference.

    Returns:
    --------
    dict mapping feature name -> value, or None when the frames string
    yields no hold with known coordinates.

    Notes:
    ------
    - The '*_nn_distance' features summarise ALL pairwise hold distances;
      only 'min_nn_distance' is a true nearest-neighbour distance.
    - 'holds_per_vertical_foot' and the '12in' neighbour threshold use raw
      board-coordinate units.
    - 'path_length_vertical' visits holds in ascending-y order; it is not a
      nearest-neighbour tour.
    """
    features = {}

    # Parse frames
    holds = parse_frames(row['frames'])
    angle = row['angle']

    if not holds:
        return None

    # =====================
    # BASIC HOLD EXTRACTION
    # =====================
    hold_data = []
    for placement_id, role_id in holds:
        coords = placement_coords.get(placement_id, (None, None))
        if coords[0] is None:
            continue

        role_type = get_role_type(role_id)
        is_hand = role_id in HAND_ROLE_IDS
        is_foot = role_id in FOOT_ROLE_IDS

        hold_data.append({
            'placement_id': placement_id,
            'x': coords[0],
            'y': coords[1],
            'role_id': role_id,
            'role_type': role_type,
            'is_hand': is_hand,
            'is_foot': is_foot,
            'difficulty': _lookup_hold_difficulty(
                df_hold_difficulty, placement_id, role_type, is_hand, is_foot, angle
            ),
        })

    if not hold_data:
        return None

    df_holds = pd.DataFrame(hold_data)
    # Force a float column (None -> NaN) so the statistics below never see object dtype
    df_holds['difficulty'] = df_holds['difficulty'].astype(float)

    # Separate by role
    hand_holds = df_holds[df_holds['is_hand']]
    foot_holds = df_holds[df_holds['is_foot']]
    start_holds = df_holds[df_holds['role_type'] == 'start']
    finish_holds = df_holds[df_holds['role_type'] == 'finish']
    middle_holds = df_holds[df_holds['role_type'] == 'middle']

    # =====================
    # 1. ANGLE
    # =====================
    features['angle'] = angle

    # =====================
    # 2. BASIC COUNTS
    # =====================
    features['total_holds'] = len(df_holds)
    features['hand_holds'] = len(hand_holds)
    features['foot_holds'] = len(foot_holds)
    features['start_holds'] = len(start_holds)
    features['finish_holds'] = len(finish_holds)
    features['middle_holds'] = len(middle_holds)

    # =====================
    # 3. MATCHING FEATURE
    # =====================
    # A climb is "matching" if you are allowed to match your hands at any hold.
    # There are slight differences in difficulty between matching and no-matching
    # climbs as per our analysis in 01. The description may be NULL in the
    # database, so only run the text search on real strings.
    description = row['description']
    described_nomatch = bool(
        isinstance(description, str) and _NOMATCH_PATTERN.search(description)
    )
    features['is_nomatch'] = int((row['is_nomatch'] == 1) or described_nomatch)

    # =====================
    # 4. SPATIAL/POSITION
    # =====================
    xs = df_holds['x'].values
    ys = df_holds['y'].values

    features['mean_x'] = np.mean(xs)
    features['mean_y'] = np.mean(ys)
    features['std_x'] = np.std(xs) if len(xs) > 1 else 0
    features['std_y'] = np.std(ys) if len(ys) > 1 else 0
    features['range_x'] = np.max(xs) - np.min(xs)
    features['range_y'] = np.max(ys) - np.min(ys)
    features['min_y'] = np.min(ys)
    features['max_y'] = np.max(ys)

    # =====================
    # 5. HEIGHT FEATURES
    # =====================
    for prefix, subset in (('start', start_holds), ('finish', finish_holds)):
        if len(subset) > 0:
            features[f'{prefix}_height'] = subset['y'].mean()
            features[f'{prefix}_height_min'] = subset['y'].min()
            features[f'{prefix}_height_max'] = subset['y'].max()
        else:
            features[f'{prefix}_height'] = np.nan
            features[f'{prefix}_height_min'] = np.nan
            features[f'{prefix}_height_max'] = np.nan

    features['height_gained'] = features['max_y'] - features['min_y']
    # NaN propagates when either the start or the finish height is missing
    features['height_gained_start_finish'] = features['finish_height'] - features['start_height']

    # =====================
    # 6. BBOX FEATURES
    # =====================
    bbox_width = features['range_x']
    bbox_height = features['range_y']
    features['bbox_area'] = bbox_width * bbox_height
    features['bbox_aspect_ratio'] = bbox_width / bbox_height if bbox_height > 0 else 0
    features['bbox_normalized_area'] = features['bbox_area'] / (board_width * board_height)

    # =====================
    # 7. HOLD DENSITY
    # =====================
    if features['bbox_area'] > 0:
        features['hold_density'] = features['total_holds'] / features['bbox_area']
    else:
        features['hold_density'] = 0

    features['holds_per_vertical_foot'] = features['total_holds'] / max(features['range_y'], 1)

    # =====================
    # 8. SYMMETRY/BALANCE
    # =====================
    center_x = (x_min + x_max) / 2
    features['left_holds'] = (df_holds['x'] < center_x).sum()
    features['right_holds'] = (df_holds['x'] >= center_x).sum()
    features['left_ratio'] = features['left_holds'] / features['total_holds'] if features['total_holds'] > 0 else 0.5

    # Symmetry score (how balanced left/right): 1 = even split, 0 = one-sided
    features['symmetry_score'] = 1 - abs(features['left_ratio'] - 0.5) * 2

    # Hand symmetry
    if len(hand_holds) > 0:
        hand_left = (hand_holds['x'] < center_x).sum()
        features['hand_left_ratio'] = hand_left / len(hand_holds)
        features['hand_symmetry'] = 1 - abs(features['hand_left_ratio'] - 0.5) * 2
    else:
        features['hand_left_ratio'] = np.nan
        features['hand_symmetry'] = np.nan

    # =====================
    # 9. VERTICAL DISTRIBUTION
    # =====================
    y_median = np.median(ys)
    features['upper_holds'] = (df_holds['y'] > y_median).sum()
    features['lower_holds'] = (df_holds['y'] <= y_median).sum()
    features['upper_ratio'] = features['upper_holds'] / features['total_holds']

    # =====================
    # 10. HAND REACH / SPREAD
    # =====================
    if len(hand_holds) >= 2:
        hand_xs = hand_holds['x'].values
        hand_ys = hand_holds['y'].values
        hand_distances = _pairwise_distances(hand_xs, hand_ys)

        features['max_hand_reach'] = hand_distances.max()
        features['min_hand_reach'] = hand_distances.min()
        features['mean_hand_reach'] = np.mean(hand_distances)
        features['std_hand_reach'] = np.std(hand_distances)
        features['hand_spread_x'] = hand_xs.max() - hand_xs.min()
        features['hand_spread_y'] = hand_ys.max() - hand_ys.min()
    else:
        features['max_hand_reach'] = 0
        features['min_hand_reach'] = 0
        features['mean_hand_reach'] = 0
        features['std_hand_reach'] = 0
        features['hand_spread_x'] = 0
        features['hand_spread_y'] = 0

    # =====================
    # 11. FOOT SPREAD
    # =====================
    if len(foot_holds) >= 2:
        foot_xs = foot_holds['x'].values
        foot_ys = foot_holds['y'].values
        foot_distances = _pairwise_distances(foot_xs, foot_ys)

        features['max_foot_spread'] = foot_distances.max()
        features['mean_foot_spread'] = np.mean(foot_distances)
        features['foot_spread_x'] = foot_xs.max() - foot_xs.min()
        features['foot_spread_y'] = foot_ys.max() - foot_ys.min()
    else:
        features['max_foot_spread'] = 0
        features['mean_foot_spread'] = 0
        features['foot_spread_x'] = 0
        features['foot_spread_y'] = 0

    # =====================
    # 12. HAND-TO-FOOT DISTANCES
    # =====================
    if len(hand_holds) > 0 and len(foot_holds) > 0:
        # Broadcast to a (hands x feet) grid of distances instead of nested loops
        dx = hand_holds['x'].values[:, None] - foot_holds['x'].values[None, :]
        dy = hand_holds['y'].values[:, None] - foot_holds['y'].values[None, :]
        h2f_distances = np.sqrt(dx ** 2 + dy ** 2).ravel()

        features['max_hand_to_foot'] = h2f_distances.max()
        features['min_hand_to_foot'] = h2f_distances.min()
        features['mean_hand_to_foot'] = np.mean(h2f_distances)
        features['std_hand_to_foot'] = np.std(h2f_distances)
    else:
        features['max_hand_to_foot'] = 0
        features['min_hand_to_foot'] = 0
        features['mean_hand_to_foot'] = 0
        features['std_hand_to_foot'] = 0

    # =====================
    # 13. HOLD DIFFICULTY FEATURES
    # =====================
    # Keep the mask so difficulties can be paired with the matching hold positions
    has_difficulty = df_holds['difficulty'].notna().values
    difficulties = df_holds['difficulty'].values[has_difficulty]

    if len(difficulties) > 0:
        features['mean_hold_difficulty'] = np.mean(difficulties)
        features['max_hold_difficulty'] = np.max(difficulties)
        features['min_hold_difficulty'] = np.min(difficulties)
        features['std_hold_difficulty'] = np.std(difficulties)
        features['median_hold_difficulty'] = np.median(difficulties)
        features['difficulty_range'] = features['max_hold_difficulty'] - features['min_hold_difficulty']
    else:
        features['mean_hold_difficulty'] = np.nan
        features['max_hold_difficulty'] = np.nan
        features['min_hold_difficulty'] = np.nan
        features['std_hold_difficulty'] = np.nan
        features['median_hold_difficulty'] = np.nan
        features['difficulty_range'] = np.nan

    # Hand / foot difficulty
    hand_diffs = hand_holds['difficulty'].dropna().values
    foot_diffs = foot_holds['difficulty'].dropna().values
    for prefix, values in (('hand', hand_diffs), ('foot', foot_diffs)):
        if len(values) > 0:
            features[f'mean_{prefix}_difficulty'] = np.mean(values)
            features[f'max_{prefix}_difficulty'] = np.max(values)
            features[f'std_{prefix}_difficulty'] = np.std(values)
        else:
            features[f'mean_{prefix}_difficulty'] = np.nan
            features[f'max_{prefix}_difficulty'] = np.nan
            features[f'std_{prefix}_difficulty'] = np.nan

    # Start/Finish difficulty
    start_diffs = start_holds['difficulty'].dropna().values
    finish_diffs = finish_holds['difficulty'].dropna().values

    features['start_difficulty'] = np.mean(start_diffs) if len(start_diffs) > 0 else np.nan
    features['finish_difficulty'] = np.mean(finish_diffs) if len(finish_diffs) > 0 else np.nan

    # =====================
    # 14. ADDITIONAL BASIC FEATURES
    # =====================

    # Hand to foot ratio
    features['hand_foot_ratio'] = features['hand_holds'] / max(features['foot_holds'], 1)

    # Movement complexity
    features['movement_density'] = features['total_holds'] / max(features['height_gained'], 1)

    # Center of mass of holds (unweighted, so equal to mean_x / mean_y)
    features['hold_com_x'] = np.average(xs, weights=None)
    features['hold_com_y'] = np.average(ys, weights=None)

    # Weighted difficulty (by y position): higher holds get weights up to 1.5,
    # the lowest hold 0.5. Weights are taken from the holds that actually have
    # a difficulty score so each score is paired with its own height.
    if len(difficulties) > 0:
        weights = (ys[has_difficulty] - ys.min()) / max(ys.max() - ys.min(), 1) + 0.5
        features['weighted_difficulty'] = np.average(difficulties, weights=weights)
    else:
        features['weighted_difficulty'] = features['mean_hold_difficulty']

    # =====================================================
    # 15. GEOMETRIC FEATURES
    # =====================================================
    n_holds = len(df_holds)
    points = np.column_stack([xs, ys])

    # Convex hull area (2D polygon enclosing all holds)
    if n_holds >= 3:
        try:
            hull = ConvexHull(points)
        except (QhullError, ValueError):
            # Degenerate geometry (e.g. all holds collinear) has no 2D hull
            features['convex_hull_area'] = np.nan
            features['convex_hull_perimeter'] = np.nan
            features['hull_area_to_bbox_ratio'] = np.nan
        else:
            features['convex_hull_area'] = hull.volume  # In 2D, volume = area
            features['convex_hull_perimeter'] = hull.area  # In 2D, area = perimeter
            features['hull_area_to_bbox_ratio'] = features['convex_hull_area'] / max(features['bbox_area'], 1)
    else:
        features['convex_hull_area'] = 0
        features['convex_hull_perimeter'] = 0
        features['hull_area_to_bbox_ratio'] = 0

    # Pairwise hold distances (statistics over every pair of holds)
    if n_holds >= 2:
        pair_distances = pdist(points)

        features['min_nn_distance'] = np.min(pair_distances)
        features['mean_nn_distance'] = np.mean(pair_distances)
        features['max_nn_distance'] = np.max(pair_distances)
        features['std_nn_distance'] = np.std(pair_distances)
    else:
        features['min_nn_distance'] = 0
        features['mean_nn_distance'] = 0
        features['max_nn_distance'] = 0
        features['std_nn_distance'] = 0

    # Clustering coefficient (holds grouped vs spread)
    if n_holds >= 3:
        dist_matrix = squareform(pair_distances)

        # Count neighbors within threshold (e.g., 12 inches)
        threshold = 12.0
        neighbors_count = (dist_matrix < threshold).sum(axis=1) - 1  # Exclude self
        features['mean_neighbors_12in'] = np.mean(neighbors_count)
        features['max_neighbors_12in'] = np.max(neighbors_count)

        # Clustering: ratio of actual neighbors to max possible (n_holds - 1 >= 2 here)
        features['clustering_ratio'] = np.mean(neighbors_count) / (n_holds - 1)
    else:
        features['mean_neighbors_12in'] = 0
        features['max_neighbors_12in'] = 0
        features['clustering_ratio'] = 0

    # Path length: visit holds bottom to top (sorted by y) as an approximate path
    if n_holds >= 2:
        order = np.argsort(ys)
        step_lengths = np.sqrt(np.diff(xs[order]) ** 2 + np.diff(ys[order]) ** 2)

        path_length = 0
        for step in step_lengths:
            path_length += step

        features['path_length_vertical'] = path_length
        features['path_efficiency'] = features['height_gained'] / max(path_length, 1)
    else:
        features['path_length_vertical'] = 0
        features['path_efficiency'] = 0

    # =====================================================
    # 16. DIFFICULTY-WEIGHTED FEATURES
    # =====================================================

    # Difficulty gradient (finish vs start); NaN when either side is missing
    features['difficulty_gradient'] = features['finish_difficulty'] - features['start_difficulty']

    # Mean difficulty by vertical region (split into thirds)
    if len(difficulties) > 0:
        y_low = ys.min()
        y_range = ys.max() - y_low

        if y_range > 0:
            first_cut = y_low + y_range / 3
            second_cut = y_low + 2 * y_range / 3
            region_masks = (
                ('lower', ys <= first_cut),
                ('middle', (ys > first_cut) & (ys <= second_cut)),
                ('upper', ys > second_cut),
            )
            for region, mask in region_masks:
                # Mean of an empty selection is NaN, which marks "no scored hold here"
                features[f'{region}_region_difficulty'] = df_holds.loc[mask & has_difficulty, 'difficulty'].mean()

            # Difficulty progression (upper - lower); NaN when either region is empty
            features['difficulty_progression'] = (
                features['upper_region_difficulty'] - features['lower_region_difficulty']
            )
        else:
            # All holds at the same height: every region is the whole climb
            features['lower_region_difficulty'] = features['mean_hold_difficulty']
            features['middle_region_difficulty'] = features['mean_hold_difficulty']
            features['upper_region_difficulty'] = features['mean_hold_difficulty']
            features['difficulty_progression'] = 0
    else:
        features['lower_region_difficulty'] = np.nan
        features['middle_region_difficulty'] = np.nan
        features['upper_region_difficulty'] = np.nan
        features['difficulty_progression'] = np.nan

    # Hardest move estimate and difficulty-weighted reach, both over hand holds
    # taken in ascending-y order. Two scored hand holds are needed for either.
    if len(hand_diffs) >= 2:
        hand_sorted = hand_holds.sort_values('y')

        # Max/mean difficulty jump between consecutive scored hand holds
        difficulty_jumps = np.abs(np.diff(hand_sorted['difficulty'].dropna().values))
        features['max_difficulty_jump'] = np.max(difficulty_jumps)
        features['mean_difficulty_jump'] = np.mean(difficulty_jumps)

        # Move length times the average difficulty of its two holds; unscored
        # holds borrow the mean hand difficulty
        diffs_sorted = hand_sorted['difficulty'].fillna(hand_diffs.mean()).values
        move_lengths = np.sqrt(
            np.diff(hand_sorted['x'].values) ** 2 + np.diff(hand_sorted['y'].values) ** 2
        )
        weighted_reach = move_lengths * ((diffs_sorted[:-1] + diffs_sorted[1:]) / 2)

        features['difficulty_weighted_reach'] = np.mean(weighted_reach)
        features['max_weighted_reach'] = np.max(weighted_reach)
    else:
        features['max_difficulty_jump'] = 0
        features['mean_difficulty_jump'] = 0
        features['difficulty_weighted_reach'] = 0
        features['max_weighted_reach'] = 0

    # =====================================================
    # 17. POSITION-NORMALIZED FEATURES
    # =====================================================

    # Normalized positions (0-1 scale)
    features['mean_x_normalized'] = (features['mean_x'] - x_min) / board_width
    features['mean_y_normalized'] = (features['mean_y'] - y_min) / board_height
    features['std_x_normalized'] = features['std_x'] / board_width
    features['std_y_normalized'] = features['std_y'] / board_height

    # Start/finish normalized (NaN heights stay NaN)
    features['start_height_normalized'] = (features['start_height'] - y_min) / board_height
    features['finish_height_normalized'] = (features['finish_height'] - y_min) / board_height

    # Distance from typical positions (15% of board height for start, 85% for finish)
    typical_start_y = y_min + board_height * 0.15
    typical_finish_y = y_min + board_height * 0.85
    features['start_offset_from_typical'] = abs(features['start_height'] - typical_start_y)
    features['finish_offset_from_typical'] = abs(features['finish_height'] - typical_finish_y)

    # Hold positions relative to the mean start height (NaN without start holds)
    features['mean_y_relative_to_start'] = features['mean_y'] - features['start_height']
    features['max_y_relative_to_start'] = features['max_y'] - features['start_height']

    # Spread normalized by board
    features['spread_x_normalized'] = features['range_x'] / board_width
    features['spread_y_normalized'] = features['range_y'] / board_height

    # Bbox coverage (fraction of the board spanned in each direction)
    features['bbox_coverage_x'] = features['range_x'] / board_width
    features['bbox_coverage_y'] = features['range_y'] / board_height

    # Position quartile features
    y_quartiles = np.percentile(ys, [25, 50, 75])
    features['y_q25'] = y_quartiles[0]
    features['y_q50'] = y_quartiles[1]
    features['y_q75'] = y_quartiles[2]
    features['y_iqr'] = y_quartiles[2] - y_quartiles[0]

    # Holds in the bottom and top vertical quartiles
    features['holds_bottom_quartile'] = (ys < y_quartiles[0]).sum()
    features['holds_top_quartile'] = (ys >= y_quartiles[2]).sum()

    return features

# Smoke-test the extractor on a single climb. The third argument must be the
# hold-difficulty table (not df_placements), otherwise every difficulty
# feature silently comes back as NaN. The index is clamped so the check also
# works when fewer than 10001 climbs were loaded.
sample_idx = min(10000, len(df_climbs) - 1)
extract_features(df_climbs.iloc[sample_idx], placement_coords, df_hold_difficulty)

"""
==================================
Extract features for all climbs
==================================
"""

from tqdm import tqdm  # Progress bar. The per-climb loop below takes a while.

print(f"Extracting features for {len(df_climbs)} climbs...")

feature_list = []

for idx, row in tqdm(df_climbs.iterrows(), total=len(df_climbs)):
    features = extract_features(row, placement_coords, df_hold_difficulty)
    if not features:
        # No usable holds for this climb: nothing to record
        continue
    # Attach the identifier and the regression target to the feature dict
    features.update(climb_uuid=row['uuid'], display_difficulty=row['display_difficulty'])
    feature_list.append(features)

df_features = pd.DataFrame(feature_list).set_index('climb_uuid')

print(f"\nExtracted features for {len(df_features)} climbs")
print(f"Feature columns: {len(df_features.columns)}")

print("\n### Feature Table Sample\n")
display(df_features.head(10))

"""
==================================
Feature Summary Statistics
==================================
"""

print("### Feature Summary\n")

# describe() statistics, one row per feature, plus missing-value counts and percentages
summary = df_features.describe().T.assign(
    missing=df_features.isna().sum(),
    missing_pct=(df_features.isna().sum() / len(df_features) * 100).round(2),
)

display(summary[['count', 'mean', 'std', 'min', 'max', 'missing', 'missing_pct']])

"""
==================================
Correlation with Difficulty
==================================
"""

# Correlation of every feature with the target, strongest (by magnitude) first
correlations = (
    df_features.corr()
    .loc[:, 'display_difficulty']
    .drop(index='display_difficulty')
    .sort_values(key=abs, ascending=False)
)

print("### Top 30 Features Correlated with Difficulty\n")
display(correlations.head(30).to_frame(name='correlation'))

print("\n### Bottom 10 Features (Least Correlated)\n")
display(correlations.tail(10).to_frame(name='correlation'))

"""
==================================
Visualize Key Features
==================================
"""

import matplotlib.pyplot as plt
import seaborn as sns

# Sixteen features, one per panel of the 4x4 grid
key_features = [
    'angle',
    'total_holds',
    'height_gained',
    'mean_hold_difficulty',
    'max_hold_difficulty',
    'mean_hand_reach',
    'hold_density',
    'symmetry_score',
    'is_nomatch',
    'convex_hull_area',
    'difficulty_progression',
    'mean_y_normalized',
    'clustering_ratio',
    'path_efficiency',
    'max_difficulty_jump',
    'difficulty_weighted_reach'
]

fig, axes = plt.subplots(4, 4, figsize=(16, 16))

for ax, feature in zip(axes.flat, key_features):
    if feature not in df_features.columns:
        # Leave the panel empty when the feature was not produced
        continue
    ax.scatter(df_features[feature], df_features['display_difficulty'], alpha=0.3, s=10)
    ax.set(xlabel=feature, ylabel='Difficulty', title=f'{feature} vs Difficulty')

plt.tight_layout()
plt.savefig('../images/04_climb_features/feature_correlations.png', dpi=150, bbox_inches='tight')
plt.show()

"""
==================================
Add Interaction Features
==================================
"""

# Angle interactions (missing mean difficulty is treated as 0 in the products)
df_features['angle_x_holds'] = df_features['angle'].mul(df_features['total_holds'])
df_features['angle_x_difficulty'] = df_features['angle'].mul(
    df_features['mean_hold_difficulty'].fillna(0)
)
df_features['angle_squared'] = df_features['angle'].pow(2)

# Difficulty interactions
df_features['difficulty_x_height'] = (
    df_features['mean_hold_difficulty'].fillna(0).mul(df_features['height_gained'])
)
df_features['difficulty_x_density'] = (
    df_features['mean_hold_difficulty'].fillna(0).mul(df_features['hold_density'])
)

# Complexity features: hold count x mean hand reach x hold density
df_features['complexity_score'] = (
    df_features['total_holds']
    .mul(df_features['mean_hand_reach'].fillna(0))
    .mul(df_features['hold_density'])
)

# Geometric × difficulty
df_features['hull_area_x_difficulty'] = (
    df_features['convex_hull_area'].fillna(0).mul(df_features['mean_hold_difficulty'].fillna(0))
)

print(f"Added interaction features. Total columns: {len(df_features.columns)}")

"""
==================================
Handle Missing Values
==================================
"""

# Report which columns still contain NaNs before imputing
missing = df_features.isna().sum()
missing_cols = missing.loc[missing.gt(0)]

print("### Columns with Missing Values\n")
display(missing_cols.to_frame('missing'))

# Fill difficulty NaNs with column mean
difficulty_cols = [name for name in df_features.columns if 'difficulty' in name.lower()]
for col in difficulty_cols:
    column = df_features[col]
    if column.isna().any():
        df_features[col] = column.fillna(column.mean())

# Climbs without start/finish holds fall back to their lowest/highest hold;
# the normalized variants are then derived from the filled heights
for height_col, fallback_col in (('start_height', 'min_y'), ('finish_height', 'max_y')):
    df_features[height_col] = df_features[height_col].fillna(df_features[fallback_col])

    normalized_col = f'{height_col}_normalized'
    df_features[normalized_col] = df_features[normalized_col].fillna(
        (df_features[height_col] - y_min) / board_height
    )

# Fill other NaNs with column means (float64 / int64 columns only)
for col in df_features.columns:
    column = df_features[col]
    if column.isna().any() and column.dtype in ['float64', 'int64']:
        df_features[col] = column.fillna(column.mean())

# Check remaining missing
remaining = df_features.isna().sum().sum()
print(f"\nRemaining missing values: {remaining}")

"""
===================================
Feature Importance Review
===================================
"""

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

# Separate the target from the predictors: every other column is a feature.
y = df_features['display_difficulty']
X = df_features.drop(columns=['display_difficulty'])

# Seeded forest so the importance ranking is reproducible between runs.
rf = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    random_state=3,
    n_jobs=-1,
)
rf.fit(X, y)

# Importances come from the model fitted on the full matrix above.
importance = pd.DataFrame(
    {'feature': X.columns, 'importance': rf.feature_importances_}
)
importance = importance.sort_values('importance', ascending=False)

print("### Top 30 Most Important Features (Random Forest)\n")
display(importance.head(30))

# Cross-validation score: the scorer is the *negated* MAE, so the mean is
# sign-flipped before reporting.
scores = cross_val_score(rf, X, y, scoring='neg_mean_absolute_error', cv=5)
mae_mean = -scores.mean()
mae_spread = scores.std()
print(f"\nCross-validated MAE: {mae_mean:.2f} (+/- {mae_spread:.2f})")

"""
============================
Save Feature Matrix
============================
"""
# Columns ending in `_raw` are dropped so they do not end up in the saved
# feature matrix.
raw_cols = [c for c in df_features.columns if c.endswith('_raw')]
if raw_cols:
    print("Dropping raw columns from final climb feature matrix:")
    print(raw_cols)
    df_features = df_features.drop(columns=raw_cols)

# `climb_features.csv` is the canonical name used by later notebooks.
df_features.to_csv('../data/04_climb_features/climb_features.csv')

print("Saved feature matrix to:")
print("  - ../data/04_climb_features/climb_features.csv")

# One column name per line, covering every column still in the matrix.
# The encoding is explicit so the file does not depend on the platform's
# default locale encoding.
with open('../data/04_climb_features/feature_list.txt', 'w', encoding='utf-8') as f:
    f.writelines(f"{col}\n" for col in df_features.columns)

print("\nFeature list saved to ../data/04_climb_features/feature_list.txt")

"""
==================================
Final Feature Summary
==================================
"""

# Row/column counts of the final matrix; one column is the target.
n_climbs, n_columns = df_features.shape

print("### Feature Engineering Complete\n")
print(f"Total climbs: {n_climbs}")
print(f"Total features: {n_columns - 1}")  # Exclude target
print("Target: display_difficulty")
print(f"Feature matrix shape: {df_features.shape}")

# Closing notes printed verbatim at the end of the run.
interpretation = """\nInterpretation:
- Each row is a climb-angle observation.
- The target is `display_difficulty`.
- The predictors combine geometry, hold statistics, and aggregate difficulty information.
- Hold-difficulty-based features use Bayesian-smoothed hold scores from Notebook 03.
- The next notebook tests how much predictive signal these engineered features actually contain.
"""
print(interpretation)

Tension Board 2 Mirror: Feature Engineering¶

Modelling idea¶

Output¶

Notebook Structure¶

Setup and Imports¶

Feature Extraction¶

Sanity Check on One Example¶

Extract Features for all climbs¶

Correlation with Difficulty¶

Visualizing Key Features¶

Conclusion¶