import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Load the startup dataset and preview its first ten rows.
df = pd.read_csv('50_Startups.csv')
df.head(n=10)

# Features are every column except the last; the target is the last column.
X = df.iloc[:, :-1].to_numpy()
y = df.iloc[:, -1].to_numpy()

print(X[:3])  # The last feature column holds string values.

[[165349.2 136897.8 471784.1 'New York']
 [162597.7 151377.59 443898.53 'California']
 [153441.51 101145.55 407934.54 'Florida']]

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
# One-hot encode the column at index 3 (the string-valued one) and pass the
# remaining columns through unchanged.
state_encoder = ('encoder', OneHotEncoder(), [3])
ct = ColumnTransformer(
    transformers=[state_encoder],
    remainder='passthrough',
)
X = np.array(ct.fit_transform(X))

print(X[:3])  # The string column has been replaced by encoded columns.

[[0.0 0.0 1.0 165349.2 136897.8 471784.1]
 [1.0 0.0 0.0 162597.7 151377.59 443898.53]
 [0.0 1.0 0.0 153441.51 101145.55 407934.54]]

from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

from sklearn.linear_model import LinearRegression
# Fit an ordinary least-squares model on the training split.
regressor = LinearRegression()
regressor.fit(X=X_train, y=y_train)

LinearRegression()

# Show two decimals when printing arrays.
np.set_printoptions(precision=2)

# Predict on the held-out rows and print predictions (left column) next to
# the true targets (right column).
y_pred = regressor.predict(X_test)
print(np.column_stack((y_pred, y_test)))

[[103015.2  103282.38]
 [132582.28 144259.4 ]
 [132447.74 146121.95]
 [ 71976.1   77798.83]
 [178537.48 191050.39]
 [116161.24 105008.31]
 [ 67851.69  81229.06]
 [ 98791.73  97483.56]
 [113969.44 110352.25]
 [167921.07 166187.94]]

import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Error metrics on the held-out test set.
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)  # RMSE is the square root of MSE.
r2 = r2_score(y_true=y_test, y_pred=y_pred)

# One metric per line, each rounded to two decimals.
print(
    f"Mean Absolute Error (MAE): {mae:.2f}",
    f"Mean Squared Error (MSE): {mse:.2f}",
    f"Root Mean Squared Error (RMSE): {rmse:.2f}",
    f"R² Score: {r2:.2f}",
    sep="\n",
)

Mean Absolute Error (MAE): 7514.29
Mean Squared Error (MSE): 83502864.03
Root Mean Squared Error (RMSE): 9137.99
R² Score: 0.93

# Names of the transformer's output columns (encoded columns plus the
# passthrough columns), taken from the fitted ColumnTransformer.
feature_names = ct.get_feature_names_out().tolist()

# Pair each output column name with the coefficient the model learned for it.
coefficients = pd.DataFrame(
    data={"Feature": feature_names, "Coefficient": regressor.coef_},
)

coefficients

# A single new observation laid out like the raw feature matrix: three numeric
# columns followed by the string-valued column.
new_row = [160000, 130000, 400000, 'California']
new_data = np.array([new_row], dtype=object)

# Reuse the already-fitted transformer, then predict with the trained model.
new_data_encoded = ct.transform(new_data)
predicted_profit = regressor.predict(new_data_encoded)

# Report the single prediction as a dollar amount.
print(f"Predicted Profit: ${predicted_profit[0]:,.2f}")

Predicted Profit: $185,227.93

import seaborn as sns
import matplotlib.pyplot as plt

# Order the rows from largest to smallest absolute coefficient.
order = coefficients.Coefficient.abs().sort_values(ascending=False).index
coef_df = coefficients.loc[order]

fig = plt.figure(figsize=(8, 5))

# Horizontal bars, one per feature. hue mirrors the y variable so each bar gets
# its own palette colour; the legend would only repeat the y labels, so it is off.
ax = sns.barplot(
    data=coef_df,
    x="Coefficient",
    y="Feature",
    hue="Feature",
    palette="coolwarm",
    legend=False,
)

# Title, axis labels and a light dashed grid.
ax.set_title("Feature Importance (Linear Regression Coefficients)", fontsize=14)
ax.set_xlabel("Coefficient Value", fontsize=12)
ax.set_ylabel("Feature", fontsize=12)
ax.grid(True, linestyle='--', alpha=0.4)

# Footer credit in the bottom-right corner of the figure.
fig.text(0.99, 0.01, "thomasuhuru.com", ha="right", fontsize=9, color="gray")

fig.tight_layout()
plt.show()

	R&D Spend	Administration	Marketing Spend	State	Profit
0	165349.20	136897.80	471784.10	New York	192261.83
1	162597.70	151377.59	443898.53	California	191792.06
2	153441.51	101145.55	407934.54	Florida	191050.39
3	144372.41	118671.85	383199.62	New York	182901.99
4	142107.34	91391.77	366168.42	Florida	166187.94
5	131876.90	99814.71	362861.36	New York	156991.12
6	134615.46	147198.87	127716.82	California	156122.51
7	130298.13	145530.06	323876.68	Florida	155752.60
8	120542.52	148718.95	311613.29	New York	152211.77
9	123334.88	108679.17	304981.62	California	149759.96

	fit_intercept	True
	copy_X	True
	tol	1e-06
	n_jobs	None
	positive	False

	Feature	Coefficient
0	encoder__x3_California	86.638369
1	encoder__x3_Florida	-872.645791
2	encoder__x3_New York	786.007422
3	remainder__x0	0.773467
4	remainder__x1	0.032885
5	remainder__x2	0.036610

Multiple Linear Regression¶

Importing the libraries¶

Importing the dataset¶

Encoding categorical data¶

Splitting the dataset into the Training set and Test set¶

Training the Multiple Linear Regression model on the Training set¶

Predicting the Test set results¶

Model Performance Evaluation¶

Interpret Model Coefficients¶

1. Location (Categorical Features)¶

2. Numerical Predictors (Continuous Variables)¶

Insights¶

Predict new data¶

Visualize Model Coefficients¶