import pandas as pd

# Load the salary dataset from a CSV file in the current working directory.
df = pd.read_csv('Salary_Data.csv')
# Interactive preview of the first 10 rows; the result is not stored.
df.head(10)

# Features: every column except the last, as a 2-D NumPy array.
X = df.iloc[:, :-1].values 
# Target: the last column, as a 1-D NumPy array.
y = df.iloc[:, -1].values

from sklearn.model_selection import train_test_split

# Hold out one third of the rows for testing; random_state=0 pins the shuffle
# so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 1/3, random_state = 0)

from sklearn.linear_model import LinearRegression

# Fit a linear regression model using the training split only, so the test
# split stays unseen until evaluation.
regressor = LinearRegression()
regressor.fit(X_train, y_train)

LinearRegression()

# Predict the target for the held-out test features.
y_pred = regressor.predict(X_test)

import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Score the test-set predictions against the true test targets.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
# RMSE is the square root of MSE, which puts the error back in the target's units.
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Report every metric rounded to two decimal places.
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"R² Score: {r2:.2f}")

Mean Absolute Error (MAE): 3426.43
Mean Squared Error (MSE): 21026037.33
Root Mean Squared Error (RMSE): 4585.42
R² Score: 0.97

import seaborn as sns
import matplotlib.pyplot as plt

def plot_linear_regression(X, y, x_label='X', y_label='Y', title=None,
                           caption_text="© 2025 Thomas Uhuru"):
    """
    Draw a scatter plot of ``y`` against ``X`` with a Seaborn regression line.

    The regression line is the one Seaborn fits to the points passed in; it is
    not taken from any previously trained model.

    Parameters
    ----------
    X, y : array-like
        Lists, Series, or arrays. Both are flattened to 1-D and must then
        contain the same number of values.
    x_label, y_label : str
        Axis labels. They may be identical; they are used only as labels.
    title : str, optional
        Figure title. Defaults to ``"<y_label> vs <x_label>"`` when falsy.
    caption_text : str
        Small italic caption drawn in the bottom-right corner of the figure.

    Raises
    ------
    ValueError
        If the flattened ``X`` and ``y`` differ in length (for example when
        ``X`` has more than one feature column).
    """
    x_values = np.asarray(X).ravel()
    y_values = np.asarray(y).ravel()

    # Validate before creating a figure so a bad call leaves no empty canvas.
    if x_values.size != y_values.size:
        raise ValueError(
            "X and y must contain the same number of values after flattening "
            f"(got {x_values.size} and {y_values.size})"
        )

    plt.figure(figsize=(8, 5))
    # Pass the vectors directly instead of building a DataFrame keyed by the
    # labels: identical labels would collapse such a frame to a single column.
    sns.regplot(
        x=x_values,
        y=y_values,
        scatter_kws={'color': 'red', 's': 50, 'alpha': 0.8},
        line_kws={'color': 'blue', 'linewidth': 2}
    )

    plt.title(title or f'{y_label} vs {x_label}', fontsize=14)
    plt.xlabel(x_label, fontsize=12)
    plt.ylabel(y_label, fontsize=12)

    # Caption anchored to the bottom-right corner in figure coordinates.
    plt.figtext(
        0.99, 0.01,
        caption_text,
        ha="right", va="bottom",
        fontsize=9, color="gray", style="italic"
    )

    plt.tight_layout()
    plt.show()

# Visualise the training observations together with a regression line.
plot_linear_regression(X_train, y_train, 
                       x_label='Years of Experience', 
                       y_label='Salary', title='Salary vs Experience (Training Data)')

# Visualise the held-out test observations so the figure matches its
# "Test Data" title. NOTE: the line shown is the one regplot fits to these
# points, not the line learned by `regressor`.
plot_linear_regression(X_test, y_test,
                       x_label='Years of Experience',
                       y_label='Salary', title='Salary vs Experience (Test Data)')

# Predict the salary for 7 years of experience. The input is wrapped as a
# 1x1 2-D array (one sample, one feature); [0] unwraps the single prediction.
experience = 7
predicted_salary = regressor.predict(np.array([[experience]]))[0]
print(predicted_salary)

92237.78934588778

# Predict the salary for 4.5 years of experience. The input is wrapped as a
# 1x1 2-D array (one sample, one feature); [0] unwraps the single prediction.
experience = 4.5
predicted_salary = regressor.predict(np.array([[experience]]))[0]
print(predicted_salary)

68872.93323808187

	YearsExperience	Salary
0	1.1	39343.0
1	1.3	46205.0
2	1.5	37731.0
3	2.0	43525.0
4	2.2	39891.0
5	2.9	56642.0
6	3.0	60150.0
7	3.2	54445.0
8	3.2	64445.0
9	3.7	57189.0

	fit_intercept	True
	copy_X	True
	tol	1e-06
	n_jobs	None
	positive	False

Simple Linear Regression

Importing the dataset

Splitting the dataset into the Training set and Test set

Training the Simple Linear Regression model on the Training set

Predicting the Test set results

Model Performance Evaluation

Interpretation of Results

1. Mean Absolute Error (MAE): 3426.43

2. Mean Squared Error (MSE): 21,026,037.33

3. Root Mean Squared Error (RMSE): 4,585.42

4. R² Score: 0.97

Overall Interpretation

Visualising the Results

(a) The Training set results

(b) The Test set results

Predicting Salary from Years of Experience