Least Squares Regression: A Linear Algebra Perspective¶
Introduction¶
This is meant to be a brief (and not entirely comprehensive) introduction to data science for the linear algebraist. There are of course many other, more complicated topics, but this is just to get the essence of data science (and the tools involved) from the perspective of someone with a strong linear algebra background.
One of the most fundamental questions of data science is the following.
Question: Given observed data, how can we predict certain targets?
The answer of course boils down to linear algebra, and we will begin by translating data science terms and concepts into linear algebraic ones. But first, as should be common practice for the linear algebraist, an example.
Example. Suppose that we observe $n=3$ houses, and for each house we record
- the square footage,
- the number of bedrooms,
- and additionally the sale price.
So we have a table as follows.
| House | Square ft | Bedrooms | Price (in $1000s) |
| --- | --- | --- | --- |
| 0 | 1600 | 3 | 500 |
| 1 | 2100 | 4 | 650 |
| 2 | 1550 | 2 | 475 |
So, for example, the first house is 1600 square feet, has 3 bedrooms, and costs $500,000, and so on. Our goal will be to understand the cost of a house in terms of the number of bedrooms as well as the square footage. Concretely this gives us a matrix and a vector: $$ X = \begin{bmatrix} 1600 & 3 \\ 2100 & 4 \\ 1550 & 2 \end{bmatrix} \text{ and } y =\begin{bmatrix} 500 \\ 650 \\ 475 \end{bmatrix} $$ So translating to linear algebra, the goal is to understand how $y$ depends on the columns of $X$.
Translation from Data Science to Linear Algebra¶
| Data Science (DS) Term | Linear Algebra (LA) Equivalent | Explanation |
|---|---|---|
| Dataset (with n observations and p features) | A matrix $X \in \mathbb{R}^{n \times p}$ | The dataset is just a matrix. Each row is an observation (a vector of features). Each column is a feature (a vector of its values across all observations). |
| Features | Columns of $X$ | Each feature is a column in your data matrix. |
| Observation | Rows of $X$ | Each data point corresponds to a row. |
| Targets | A vector $y \in \mathbb{R}^{n \times 1}$ | The list of all target values is a column vector. |
| Model parameters | A vector $\beta \in \mathbb{R}^{p \times 1}$ | These are the unknown coefficients. |
| Model | Matrix–vector equation | The relationship becomes an equation involving matrices and vectors. |
| Prediction Error / Residuals | A residual vector $e \in \mathbb{R}^{n \times 1}$ | Difference between actual targets and predictions. |
| Training / "best fit" | Optimization: minimizing the norm of the residual vector | To find the "best" model by finding a model which makes the norm of the residual vector as small as possible. |
So our matrix $X$ will represent our data set, our vector $y$ is the target, and $\beta$ is our vector of parameters. We will often be interested in understanding data with "intercepts", i.e., when there is a base value given in our data. So we will augment a column of 1's (denoted by $\mathbb{1}$) to $X$ and append a parameter $\beta_0$ to the top of $\beta$, yielding
$$ \tilde{X} = \begin{bmatrix} \mathbb{1} & X \end{bmatrix} \text{ and } \tilde{\beta} = \begin{bmatrix} \beta_0 \\ \beta_1 \\ \beta_2 \\ \vdots \\ \beta_p \end{bmatrix}. $$
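As a quick sketch of this augmentation in numpy (using the house data from the opening example), a single column stack does the job:

```python
import numpy as np

# Data matrix from the house example: square footage and bedrooms
X = np.array([[1600, 3],
              [2100, 4],
              [1550, 2]], dtype=float)

# Augment with a column of 1's for the intercept term
X_tilde = np.column_stack([np.ones(X.shape[0]), X])
print(X_tilde)
```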
So the answer to the Data Science problem becomes
Answer: Solve, or best approximate a solution to, the matrix equation $\tilde{X}\tilde{\beta} = y$.
To be explicit, given $\tilde{X}$ and $y$, we want to find a $\tilde{\beta}$ that does a good job of approximately satisfying $\tilde{X}\tilde{\beta} = y$. There are of course ways to solve (or approximate solutions to) such small systems by hand. However, one will often be dealing with enormous, imperfect data sets. One view to take is that modern data science is applying numerical linear algebra techniques to imperfect information, all to get as good a solution as possible.
Solving the problem: Least Squares Regression and Matrix Decompositions¶
If the system $\tilde{X}\tilde{\beta} = y$ is consistent, then we can find a solution. However, we are often dealing with overdetermined systems, in the sense that there are often more observations than features (i.e., more rows than columns in $\tilde{X}$, or more equations than unknowns), and therefore inconsistent systems. However, it is possible to find a best fit solution, in the sense that the difference
$$ e = y - \tilde{X}\tilde{\beta} $$
is small. By small, we often mean that $e$ is small in $L^2$ norm; i.e., we are minimizing the sum of the squares of the differences between the components of $y$ and the components of $\tilde{X}\tilde{\beta}$. This is known as a least-squares solution. When our data points live in the Euclidean plane, this precisely describes finding a line of best fit.
import numpy as np
import matplotlib.pyplot as plt
# 1. Generate some synthetic data
# We set a random seed for reproducibility
np.random.seed(3)
# Create 50 random x values between 0 and 10
x = np.random.uniform(0, 10, 50)
# Create y values with a linear relationship plus some random noise
# True relationship: y = 2.5x + 5 + noise
noise = np.random.normal(0, 2, 50)
y = 2.5 * x + 5 + noise
# 2. Calculate the line of best fit
# np.polyfit(x, y, deg) returns the coefficients for the polynomial
# deg=1 specifies a linear fit (first degree polynomial)
slope, intercept = np.polyfit(x, y, 1)
# Create a polynomial function from the coefficients
# This allows us to pass x values directly to get predicted y values
fit_function = np.poly1d((slope, intercept))
# Generate x values for plotting the line (smoothly across the range)
x_line = np.linspace(x.min(), x.max(), 100)
y_line = fit_function(x_line)
# 3. Plot the data and the line of best fit
plt.figure(figsize=(10, 6))
# Plot the scatter points
plt.scatter(x, y, color='purple', label='Data Points', alpha=0.7)
# Plot the line of best fit
plt.plot(x_line, y_line, color='steelblue', linestyle='--', linewidth=2, label='Line of Best Fit')
# Add labels and title
plt.xlabel('X Axis')
plt.ylabel('Y Axis')
plt.title('Scatter Plot with Line of Best Fit')
# Add the equation to the plot
# The f-string formats the slope and intercept to 2 decimal places
plt.text(1, 25, f'y = {slope:.2f}x + {intercept:.2f}', fontsize=12, bbox=dict(facecolor='white', alpha=0.8))
# Display legend and grid
plt.legend()
plt.grid(True, linestyle=':', alpha=0.6)
# Show the plot
plt.savefig('../images/line_of_best_fit_generated_1.png')
plt.show()
The structure of this section is as follows.
- Least Squares Solution
- QR Decompositions
- Singular Value Decomposition
- A note on other norms
- A note on regularization
- A note on solving multiple targets concurrently
- Polynomial regression
- What can go wrong?
Least Squares Solution¶
Recall that the Euclidean distance between two vectors $x = (x_1,\dots,x_n), y = (y_1,\dots,y_n) \in \mathbb{R}^n$ is given by
$$ ||x - y||_2 = \sqrt{\sum_{i=1}^n |x_i - y_i|^2}. $$
We will often work with the square of the $L^2$ norm to simplify things (the square function is increasing, so minimizing the square of a non-negative function will also minimize the function itself).
Definition: Let $A$ be an $m \times n$ matrix and $b \in \mathbb{R}^m$. A least-squares solution of $Ax = b$ is a vector $x_0 \in \mathbb{R}^n$ such that
$$ \|b - Ax_0\|_2 \leq \|b - Ax\|_2 \text{ for all } x \in \mathbb{R}^n. $$
So a least-squares solution to the equation $Ax = b$ is trying to find a vector $x_0 \in \mathbb{R}^n$ which realizes the smallest distance between the vector $b$ and the column space $$ \text{Col}(A) = \{Ax \mid x \in \mathbb{R}^n\} $$ of $A$. We know this to be the projection of the vector $b$ onto the column space.
import numpy as np
import matplotlib.pyplot as plt
# Linear algebra helper functions
def proj_onto_subspace(A, v):
    """
    Project vector v onto Col(A), where A is (3 x k) with columns spanning the subspace.
    Uses the formula: P = A (A^T A)^(-1) A^T (for full column rank A).
    """
    AtA = A.T @ A
    return A @ np.linalg.solve(AtA, A.T @ v)
def make_plane_grid(a, b, u_range=(-1.5, 1.5), v_range=(-1.5, 1.5), n=15):
    """
    Plane through the origin spanned by vectors a and b.
    Returns meshgrid points X, Y, Z for surface plotting.
    """
    uu = np.linspace(*u_range, n)
    vv = np.linspace(*v_range, n)
    U, V = np.meshgrid(uu, vv)
    P = U[..., None] * a + V[..., None] * b  # shape (n, n, 3)
    return P[..., 0], P[..., 1], P[..., 2]
# Choose a plane and a vector
# Plane basis vectors (span a 2D subspace in R^3)
a = np.array([1.0, 0.2, 0.0])
b = np.array([0.2, 1.0, 0.3])
# Create the associated matrix
# 3x2 matrix of full column rank
# the column space will be a plane
A = np.column_stack([a, b])
# Vector to project
v = np.array([0.8, 0.6, 1.2])
# Projection and residual
p = proj_onto_subspace(A, v)
r = v - p
# Plot
fig = plt.figure(figsize=(9, 7))
# 1 row, 1 column, 1 subplot
# axis lives in R^3
ax = fig.add_subplot(111, projection="3d")
# Plane surface
X, Y, Z = make_plane_grid(a, b)
# Here is a rectangular grid of points in 3D; draw a surface through them.
ax.plot_surface(X, Y, Z, alpha=0.25)
origin = np.zeros(3)
# v, p, and residual r
ax.quiver(*origin, *v, arrow_length_ratio=0.08, linewidth=2)
ax.quiver(*origin, *p, arrow_length_ratio=0.08, linewidth=2)
ax.quiver(*p, *r, arrow_length_ratio=0.08, linewidth=2)
# Drop line from v to its projection on the plane
ax.plot([v[0], p[0]],
        [v[1], p[1]],
        [v[2], p[2]],
        linestyle="--", linewidth=2)
# Points for emphasis
ax.scatter(*v, s=60)
ax.scatter(*p, s=60)
# Labels (simple text)
ax.text(*v, " v")
ax.text(*p, " Proj(v)")
# Make axes look nice
ax.set_xlabel("x")
ax.set_ylabel("y")
ax.set_zlabel("z")
ax.set_title("Projection of a vector onto a plane")
# Set symmetric limits so the picture isn't squished
all_pts = np.vstack([origin, v, p])
m = np.max(np.abs(all_pts)) * 1.3 + 0.2
ax.set_xlim(-m, m)
ax.set_ylim(-m, m)
ax.set_zlim(-m, m)
# Adjust spacing so labels, titles, and axes don’t overlap or get cut off.
plt.tight_layout()
plt.savefig('../images/projection_of_vector_onto_plane.png')
plt.show()
Theorem: The set of least-squares solutions of $Ax = b$ coincides with solutions of the normal equations $A^TAx = A^Tb$. Moreover, the normal equations always have a solution.
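We can sanity-check the theorem numerically. A minimal sketch with a small overdetermined system:

```python
import numpy as np

# A small overdetermined system (4 equations, 2 unknowns)
A = np.array([[1.0, 1.0],
              [1.0, 2.0],
              [1.0, 3.0],
              [1.0, 4.0]])
b = np.array([1.0, 2.0, 2.0, 4.0])

# Least-squares solution from numpy's solver
x0 = np.linalg.lstsq(A, b, rcond=None)[0]

# It satisfies the normal equations A^T A x = A^T b
print(np.allclose(A.T @ A @ x0, A.T @ b))  # True
```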
Let us first see why we get a line of best fit.
Example. Let us show why this describes a line of best fit when we are working with one feature and one target. Suppose that we observe four data points $$ X = \begin{bmatrix} 1 \\ 2 \\ 3 \\ 4 \end{bmatrix} \text{ and } y = \begin{bmatrix} 1 \\ 2 \\ 2 \\ 4 \end{bmatrix}. $$ We want to fit a line $y = \beta_0 + \beta_1x$ to these data points. We will have our augmented matrix be $$ \tilde{X} = \begin{bmatrix} 1 & 1 \\ 1 & 2 \\ 1 & 3 \\ 1 & 4 \end{bmatrix}, $$ and our parameter be $$ \tilde{\beta} = \begin{bmatrix} \beta_0 \\ \beta_1 \end{bmatrix}. $$ We have that $$ \tilde{X}^T\tilde{X} = \begin{bmatrix} 4 & 10 \\ 10 & 30 \end{bmatrix} \text{ and } \tilde{X}^Ty = \begin{bmatrix} 9 \\ 27 \end{bmatrix}. $$ The $2 \times 2$ matrix $\tilde{X}^T\tilde{X}$ is easy to invert, and so we get that $$ \tilde{\beta} = (\tilde{X}^T\tilde{X})^{-1}\tilde{X}^Ty = \frac{1}{10}\begin{bmatrix} 15 & -5 \\ -5 & 2 \end{bmatrix}\begin{bmatrix} 9 \\ 27 \end{bmatrix} = \begin{bmatrix} 0 \\ \frac{9}{10} \end{bmatrix}. $$ So our line of best fit is of the form $y = \frac{9}{10}x$.
Although the above system was small and we could solve the system of equations explicitly, this isn't always feasible. We will generally use python in order to solve large systems.
- One can find a least-squares solution directly using numpy.linalg.lstsq.
- One can set up the normal equations and solve the resulting system using numpy.linalg.solve.

Although the first approach simplifies things greatly, and is more or less what we are doing anyway, we will generally set up our problems as we would by hand, and then use numpy.linalg.solve to help us find a solution. However, computing $X^TX$ can cause numerical errors, so later we'll see how to get linear systems from QR decompositions and the SVD, and then apply numpy.linalg.solve.
Let's see how to use these for the above example, and see the code to generate the scatter plot and line of best fit. Again, our system is the following. $$ X = \begin{bmatrix} 1 \\ 2 \\ 3 \\ 4 \end{bmatrix} \text{ and } y = \begin{bmatrix} 1 \\ 2\\ 2 \\ 4 \end{bmatrix}. $$ We will do what we did above, but use python instead.
import numpy as np
# Define the matrix X and vector y
X = np.array([[1], [2], [3], [4]])
y = np.array([[1], [2], [2], [4]])
# Augment X with a column of 1's (intercept)
X_aug = np.hstack((np.ones((X.shape[0], 1)), X))
# Solve the normal equations
beta = np.linalg.solve(X_aug.T @ X_aug, X_aug.T @ y)
And what is the result?
beta
array([[-1.0658141e-15],
[ 9.0000000e-01]])
This agrees with our by-hand computation: the intercept is tiny, so it is virtually zero, and we get 9/10 as our slope. Let's plot it.
import matplotlib.pyplot as plt
b, m = beta #beta[0] will be the intercept and beta[1] will be the slope
_ = plt.plot(X, y, 'o', label='Original data', markersize=10)
_ = plt.plot(X, m*X + b, 'r', label='Line of best fit')
_ = plt.legend()
plt.savefig('../images/line_of_best_fit_easy_example.png')
plt.show()
What about numpy.linalg.lstsq? Is it any different?
import numpy as np
# Define the matrix X and vector y
X = np.array([[1], [2], [3], [4]])
y = np.array([[1], [2], [2], [4]])
# Augment X with a column of 1's (intercept)
X_aug = np.hstack((np.ones((X.shape[0], 1)), X))
# Solve the least squares equation with matrix X_aug and target y
beta = np.linalg.lstsq(X_aug, y, rcond=None)[0]
We then get
beta
array([[6.16291085e-16],
[9.00000000e-01]])
So it is a little different -- and, in fact, closer to our exact answer (the intercept is zero). This makes sense -- numpy.linalg.lstsq won't directly compute $X^TX$, which, again, can cause quite a few issues.
Now let us return to our initial example.
Example: Let us work with the example from above. We augment the matrix with a column of 1's to include an intercept term: $$ \tilde{X} = \begin{bmatrix} 1 & 1600 & 3 \\ 1 & 2100 & 4 \\ 1 & 1550 & 2 \end{bmatrix}. $$ Let us solve the normal equations $$ \tilde{X}^T\tilde{X}\tilde{\beta} = \tilde{X}^Ty. $$ We have $$ \tilde{X}^T\tilde{X} = \begin{bmatrix} 3 & 5250 & 9 \\ 5250 & 9372500 & 16300 \\ 9 & 16300 & 29\end{bmatrix} \text{ and } \tilde{X}^Ty = \begin{bmatrix} 1625 \\ 2901250 \\ 5050 \end{bmatrix} $$ Solving this system of equations yields the parameter vector $\tilde{\beta}$. In this case, we have $$ \tilde{\beta} = \begin{bmatrix} \frac{200}{9} \\ \frac{5}{18} \\ \frac{100}{9} \end{bmatrix}. $$ When we apply $\tilde{X}$ to $\tilde{\beta}$, we get $$ \tilde{X}\tilde{\beta} = \begin{bmatrix} 500 \\ 650 \\ 475 \end{bmatrix}, $$ which is our target on the nose. This means that we can expect, based on our data, that the cost of a house (in $1000s) will be $$ \frac{200}{9} + \frac{5}{18}(\text{square footage}) + \frac{100}{9}(\text{\# of bedrooms}).$$
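As a quick numerical check of this computation (a sketch, solving the normal equations with np.linalg.solve):

```python
import numpy as np

X_tilde = np.array([[1, 1600, 3],
                    [1, 2100, 4],
                    [1, 1550, 2]], dtype=float)
y = np.array([500, 650, 475], dtype=float)

# Solve the normal equations directly
beta = np.linalg.solve(X_tilde.T @ X_tilde, X_tilde.T @ y)
print(beta)            # approximately [200/9, 5/18, 100/9]
print(X_tilde @ beta)  # recovers [500, 650, 475] up to rounding
```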
In the above, we actually had a consistent system to begin with, so our least-squares solution gave our prediction honestly. What happens if we have an inconsistent system?
Example: Let us add two more observations, so our data is now the following.

| House | Square ft | Bedrooms | Price (in $1000s) |
| --- | --- | --- | --- |
| 0 | 1600 | 3 | 500 |
| 1 | 2100 | 4 | 650 |
| 2 | 1550 | 2 | 475 |
| 3 | 1600 | 3 | 490 |
| 4 | 2000 | 4 | 620 |
So setting up our system, we want a least-squares solution to the matrix equation $$ \begin{bmatrix} 1 & 1600 & 3 \\ 1 & 2100 & 4 \\ 1 & 1550 & 2 \\ 1 & 1600 & 3 \\ 1 & 2000 & 4 \end{bmatrix}\tilde{\beta} = \begin{bmatrix} 500 \\ 650 \\ 475 \\ 490 \\ 620 \end{bmatrix}. $$ Note that the system is inconsistent (the 1st and 4th rows of $\tilde{X}$ agree, but they have different prices). Writing the normal equations we have $$ \tilde{X}^T\tilde{X} = \begin{bmatrix} 5 & 8850 & 16 \\ 8850 & 15932500 & 29100 \\ 16 & 29100 & 54 \end{bmatrix} \text{ and } \tilde{X}^Ty = \begin{bmatrix} 2735 \\ 4925250 \\ 9000 \end{bmatrix}. $$ Solving this linear system yields $$ \tilde{\beta} = \begin{bmatrix} 0 \\ \frac{3}{10} \\ 5 \end{bmatrix}. $$ This is a vastly different answer! Applying $\tilde{X}$ to it yields $$ \tilde{X}\tilde{\beta} = \begin{bmatrix} 495 \\ 650 \\ 475 \\ 495 \\ 620 \end{bmatrix}. $$ Note that the error here is $$ y - \tilde{X}\tilde{\beta} = \begin{bmatrix} 5 \\ 0 \\ 0 \\ -5 \\ 0 \end{bmatrix}, $$ which has squared $L^2$ norm $$ \|y - \tilde{X}\tilde{\beta}\|_2^2 = 25 + 25 = 50. $$ So this says that, given our data, we can estimate the cost of a house, to within about \$5000 at worst here, to be $$ \approx \frac{3}{10}(\text{square footage}) + 5(\text{\# of bedrooms}). $$ In practice, our data sets can be gigantic, and so there is absolutely no hope of doing computations by hand. It is nice to know that theoretically we can do things like this though.
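To double-check the residual computation, a short numpy sketch:

```python
import numpy as np

X_tilde = np.array([[1, 1600, 3],
                    [1, 2100, 4],
                    [1, 1550, 2],
                    [1, 1600, 3],
                    [1, 2000, 4]], dtype=float)
y = np.array([500, 650, 475, 490, 620], dtype=float)

# Least-squares solution and its residual
beta = np.linalg.lstsq(X_tilde, y, rcond=None)[0]
e = y - X_tilde @ beta
print(beta)          # approximately [0, 3/10, 5]
print(np.sum(e**2))  # approximately 50
```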
Theorem: Let $A$ be an $m \times n$ matrix. The following are equivalent.
- The equation $Ax = b$ has a unique least-squares solution for each $b \in \mathbb{R}^m$.
- The columns of $A$ are linearly independent.
- The matrix $A^TA$ is invertible.
In this case, the unique solution to the normal equations $A^TAx = A^Tb$ is
$$ x_0 = (A^TA)^{-1}A^Tb. $$
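For illustration only (the next paragraph explains why this formula should usually be avoided in practice), here is a sketch checking that the closed-form expression agrees with numpy's least-squares solver on a well-conditioned random problem:

```python
import numpy as np

rng = np.random.default_rng(1)
A = rng.normal(size=(20, 3))  # random tall matrix; full column rank with probability 1
b = rng.normal(size=20)

# Closed-form solution x_0 = (A^T A)^(-1) A^T b
x_formula = np.linalg.inv(A.T @ A) @ (A.T @ b)

# Library least-squares solver
x_lstsq = np.linalg.lstsq(A, b, rcond=None)[0]
print(np.allclose(x_formula, x_lstsq))  # True
```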
Computing $\tilde{X}^T\tilde{X}$ or taking inverses is computationally intensive, and it is best to avoid doing so. Moreover, as we'll see in an example later, a numerical calculation can produce values close to zero and then divide by them where it shouldn't, blowing up the final result. One way to get around this is to use QR decompositions of matrices.
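As a preview of the QR approach covered later, here is a minimal sketch: with a thin QR decomposition $A = QR$, the normal equations reduce to the triangular system $Rx = Q^Tb$, and $A^TA$ is never formed explicitly.

```python
import numpy as np

rng = np.random.default_rng(0)
A = rng.normal(size=(50, 3))
b = rng.normal(size=50)

# Thin QR decomposition: A = QR, with Q having orthonormal columns
Q, R = np.linalg.qr(A)

# Solve the triangular system R x = Q^T b
x_qr = np.linalg.solve(R, Q.T @ b)

# Agrees with the direct least-squares solver
print(np.allclose(x_qr, np.linalg.lstsq(A, b, rcond=None)[0]))  # True
```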
Now let's use python to visualize the above data and then solve for the least-squares solution. We'll use pandas in order to think about this data. We note that pandas incorporates matplotlib under the hood already, so there are some simplifications that can be made.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# First let us make a dictionary incorporating our data.
# Each entry corresponds to a column (feature of our data)
data = {
'Square ft': [1600, 2100, 1550, 1600, 2000],
'Bedrooms': [3, 4, 2, 3, 4],
'Price': [500, 650, 475, 490, 620]
}
# Create a pandas DataFrame
df = pd.DataFrame(data)
Let's see how python formats this DataFrame. It will turn it into essentially the table we had at the beginning.
df
| | Square ft | Bedrooms | Price |
|---|---|---|---|
| 0 | 1600 | 3 | 500 |
| 1 | 2100 | 4 | 650 |
| 2 | 1550 | 2 | 475 |
| 3 | 1600 | 3 | 490 |
| 4 | 2000 | 4 | 620 |
So what can we do with DataFrames? First let's use pandas.DataFrame.describe to see some basic statistics about our data.
df.describe()
| | Square ft | Bedrooms | Price |
|---|---|---|---|
| count | 5.000000 | 5.00000 | 5.000000 |
| mean | 1770.000000 | 3.20000 | 547.000000 |
| std | 258.843582 | 0.83666 | 81.516869 |
| min | 1550.000000 | 2.00000 | 475.000000 |
| 25% | 1600.000000 | 3.00000 | 490.000000 |
| 50% | 1600.000000 | 3.00000 | 500.000000 |
| 75% | 2000.000000 | 4.00000 | 620.000000 |
| max | 2100.000000 | 4.00000 | 650.000000 |
This gives us the mean, the standard deviation, the min, and the max, as well as some other things. We get an immediate sense of the scale of our data. We can also examine the pairwise correlations of the columns by using pandas.DataFrame.corr.
df[["Square ft", "Bedrooms", "Price"]].corr()
| | Square ft | Bedrooms | Price |
|---|---|---|---|
| Square ft | 1.000000 | 0.900426 | 0.998810 |
| Bedrooms | 0.900426 | 1.000000 | 0.909066 |
| Price | 0.998810 | 0.909066 | 1.000000 |
It is clear that all three are correlated. This makes sense, as the number of bedrooms should increase with the square footage, and likewise the price. We'll discuss this further in the next section, when we look at Principal Component Analysis.
We can also graph our data; for example, we could create some scatter plots, one for Square ft vs Price and one for Bedrooms vs Price. We can also do a grouped bar chart. Let's start with the scatter plots.
# Scatter plot for Price vs Square ft
df.plot(
kind="scatter",
x="Square ft",
y="Price",
title="House Price vs Square footage"
)
plt.savefig('../images/house_price_vs_square_ft.png')
plt.show()
# Scatter plot for Price vs Bedrooms
df.plot(
kind="scatter",
x="Bedrooms",
y="Price",
title="House Price vs Bedrooms"
)
plt.savefig('../images/house_price_vs_bedrooms.png')
plt.show()
We can even do square footage vs bedrooms.
# Scatter plot for Bedrooms vs Square ft
df.plot(
kind="scatter",
x="Square ft",
y="Bedrooms",
title="Bedrooms vs Square footage"
)
plt.savefig('../images/bedrooms_vs_square_ft.png')
plt.show()
Of course, these figures are of limited use because our data set is so sparsely populated.
Now let's get our matrices and linear systems set up with pandas.DataFrame.to_numpy.
# Create our matrix X and our target y
X = df[["Square ft", "Bedrooms"]].to_numpy()
y = df[["Price"]].to_numpy()
# Augment X with a column of 1's (intercept)
X_aug = np.hstack((np.ones((X.shape[0], 1)), X))
# Solve the least-squares problem
beta = np.linalg.lstsq(X_aug, y, rcond=None)[0]
This yields
beta
array([[4.0098513e-13],
[3.0000000e-01],
[5.0000000e+00]])
As the first parameter is basically 0, we are left with the second being 3/10 and the third being 5, just like our exact solution. Next, we will look at matrix decompositions and how they can help us find least-squares solutions.
Polynomial Regression¶
Sometimes fitting a line to a set of $n$ data points clearly isn't the right thing to do. To emphasize the limitations of linear models, we generate data from a purely quadratic relationship. In this setting, the space of linear functions is not rich enough to capture the underlying structure, and the linear least-squares solution exhibits systematic error. Expanding the feature space to include quadratic terms resolves this issue.
For example, suppose our data looked like the following.
## Generate data
import numpy as np
import matplotlib.pyplot as plt
# 1) Generate quadratic data
np.random.seed(3)
n = 50
x = np.random.uniform(-5, 5, n) # symmetric, wider range
# True relationship: y = ax^2 + c + noise
a_true = 2.0
c_true = 5.0
noise = np.random.normal(0, 3, n)
y = a_true * x**2 + c_true + noise
## Generate scatter plot
plt.scatter(x,y)
# plot it
plt.savefig('../images/quadratic_data_generated_1.png')
plt.show()
If we try to find a line of best fit, we get something that doesn't really describe or approximate our data at all...
# find a line of best fit
a,b = np.polyfit(x, y, 1)
# add scatter points to plot
plt.scatter(x,y)
# add line of best fit to plot
plt.plot(x, a*x + b, 'r', linewidth=1)
# plot it
plt.savefig('../images/quadratic_data_line_of_best_fit.png')
plt.show()
This is an example of underfitting data, and we can do better. The same linear regression ideas work for fitting a degree $d$ polynomial model to a set of $n$ data points. Before, when trying to fit a line to points $(x_1,y_1),\dots,(x_n,y_n)$, we had the matrices $$ \tilde{X} = \begin{bmatrix} 1 & x_1 \\ \vdots & \vdots \\ 1 & x_n \end{bmatrix}, \quad y = \begin{bmatrix} y_1 \\ \vdots \\ y_n \end{bmatrix}, \quad \tilde{\beta} = \begin{bmatrix} \beta_0 \\ \beta_1 \end{bmatrix} $$ in the matrix equation $$ \tilde{X}\tilde{\beta} = y, $$ and we were trying to find a vector $\tilde{\beta}$ which gave a best possible solution. This gave us a line $y = \beta_0 + \beta_1x$ which best approximates the data. To fit a polynomial $y = \beta_0 + \beta_1x + \beta_2x^2 + \cdots + \beta_dx^d$ to the data, we have a similar setup.
Definition. The Vandermonde matrix is the $n \times (d+1)$ matrix $$ V = \begin{bmatrix} 1 & x_1 & x_1^2 & \cdots & x_1^d \\ 1 & x_2 & x_2^2 & \cdots & x_2^d \\ \vdots & \vdots & \vdots & \ddots & \vdots \\ 1 & x_n & x_n^2 & \cdots & x_n^d \end{bmatrix}. $$
With the Vandermonde matrix, to find a polynomial function of best fit, one just needs to find a least-squares solution to the matrix equation $$ V\tilde{\beta} = y. $$
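As a sketch, numpy's np.vander builds this matrix directly (with increasing=True giving the column order $1, x, \dots, x^d$), and the resulting least-squares solution agrees with np.polyfit:

```python
import numpy as np

# Synthetic quadratic data, as in the example above
rng = np.random.default_rng(3)
x = rng.uniform(-5, 5, 50)
y = 2.0 * x**2 + 5.0 + rng.normal(0, 3, 50)

d = 2
# Vandermonde matrix with columns 1, x, x^2
V = np.vander(x, d + 1, increasing=True)

# Least-squares solution of V beta = y
beta = np.linalg.lstsq(V, y, rcond=None)[0]

# np.polyfit solves the same problem (coefficients in decreasing degree order)
coeffs = np.polyfit(x, y, d)
print(np.allclose(beta, coeffs[::-1]))  # True
```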
With the generated data above, we get the following curve.
# polynomial fit with degree = 2
poly = np.polyfit(x,y,2)
model = np.poly1d(poly)
# add scatter points to plot
plt.scatter(x,y)
# add the quadratic to the plot
polyline=np.linspace(x.min(), x.max())
plt.plot(polyline, model(polyline), 'r', linewidth=1)
# plot it
plt.savefig('../images/quadratic_data_quadratic_of_best_fit.png')
plt.show()
Solving these problems can be done with python. One can use numpy.polyfit and numpy.poly1d.
Example. Consider the following data.

| House | Square ft | Bedrooms | Price (in $1000s) |
| --- | --- | --- | --- |
| 0 | 1600 | 3 | 500 |
| 1 | 2100 | 4 | 650 |
| 2 | 1550 | 2 | 475 |
| 3 | 1600 | 3 | 490 |
| 4 | 2000 | 4 | 620 |
Suppose we wanted to predict the price of a house based on the square footage and we thought the relationship was cubic (it clearly isn't, but hey, for the sake of argument). So really we are looking at the subset of data

| House | Square ft | Price (in $1000s) |
| --- | --- | --- |
| 0 | 1600 | 500 |
| 1 | 2100 | 650 |
| 2 | 1550 | 475 |
| 3 | 1600 | 490 |
| 4 | 2000 | 620 |
Our Vandermonde matrix will be $$ V = \begin{bmatrix} 1 & 1600 & 1600^2 & 1600^3 \\ 1 & 2100 & 2100^2 & 2100^3 \\ 1 & 1550 & 1550^2 & 1550^3 \\ 1 & 1600 & 1600^2 & 1600^3 \\ 1 & 2000 & 2000^2 & 2000^3 \end{bmatrix} $$ and our target vector will be $$ y = \begin{bmatrix} 500 \\ 650 \\ 475 \\ 490 \\ 620 \end{bmatrix}. $$ As we can see, the entries of the Vandermonde matrix get very very large very fast. One can, if they are so inclined, compute a least-squares solution to $V\tilde{\beta} = y$ by hand. Let's not, but let us find, using python, a "best" cubic approximation of the relationship between the square footage and price.
We will use numpy.polyfit, numpy.poly1d and numpy.linspace.
Let's get a cubic of best fit.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# First let us make a dictionary incorporating our data.
# Each entry corresponds to a column (feature of our data)
data = {
'Square ft': [1600, 2100, 1550, 1600, 2000],
'Bedrooms': [3, 4, 2, 3, 4],
'Price': [500, 650, 475, 490, 620]
}
# Create a pandas DataFrame
df = pd.DataFrame(data)
# Extract x (square footage) and y (price)
x = df["Square ft"].to_numpy(dtype=float)
y = df["Price"].to_numpy(dtype=float)
# Degree of polynomial
degree = 3 # cubic
# Polyfit directly on x
cubic = np.poly1d(np.polyfit(x,y, degree))
# Add fitted polynomial line and scatter plot
polyline = np.linspace(x.min(),x.max())
plt.scatter(x,y, label="Observed data")
plt.plot(polyline, cubic(polyline), 'r', label="Cubic best fit")
plt.xlabel("Square ft")
plt.ylabel("Price (in $1000s)")
plt.title("Cubic polynomial regression: Price vs Square Footage")
plt.show()
Here numpy.polyfit computes the least-squares solution in the polynomial basis $1, x, x^2, x^3$, i.e., it solves the Vandermonde least-squares problem. So what is our cubic polynomial?
cubic
poly1d([ 3.08080808e-07, -1.78106061e-03, 3.71744949e+00, -2.15530303e+03])
The first term is the degree 3 coefficient, the second the degree 2 coefficient, the third the degree 1 coefficient, and the fourth is the constant term. As an aside, the line-of-best-fit plot from the introduction can also be drawn with matplotlib's axline, which takes a point and a slope rather than precomputed line coordinates.
import numpy as np
import matplotlib.pyplot as plt
# Generate data (same as above)
np.random.seed(3)
x = np.random.uniform(0, 10, 50)
y = 2.5 * x + 5 + np.random.normal(0, 2, 50)
# Calculate slope and intercept
slope, intercept = np.polyfit(x, y, 1)
plt.figure(figsize=(10, 6))
plt.scatter(x, y, color='purple', label='Data Points', alpha=0.7)
# Plot the line using axline
# xy1=(0, intercept) is the y-intercept point
# slope=slope defines the steepness
plt.axline(xy1=(0, intercept), slope=slope, color='steelblue', linestyle='--', linewidth=2, label='Line of Best Fit')
# Add the equation to the plot
# The f-string formats the slope and intercept to 2 decimal places
plt.text(1, 25, f'y = {slope:.2f}x + {intercept:.2f}', fontsize=12, bbox=dict(facecolor='white', alpha=0.8))
plt.xlabel('X Axis')
plt.ylabel('Y Axis')
plt.title('Scatter Plot with Line of Best Fit')
plt.legend()
plt.grid(True, linestyle=':', alpha=0.6)
plt.show()