One of the aims is to reproduce Figure 2.11 of Hastie et al., The Elements of Statistical Learning.
Our data are defined by \( x\in [-3,3] \) with, for example, a total of \( n=100 \) data points.
import numpy as np
np.random.seed(2018)
n = 100
maxdegree = 14
# Make the data set: two Gaussians plus added normal noise.
x = np.linspace(-3, 3, n).reshape(-1, 1)
y = np.exp(-x**2) + 1.5 * np.exp(-(x - 2)**2) + np.random.normal(0, 0.1, x.shape)
where \( y \), a sum of two Gaussians with added normal noise, is the function we want to fit with a given polynomial.
Write a first code which sets up a design matrix \( X \) defined by a fourth-order polynomial. Scale your data and split it into training and test data.
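Before turning to scikit-learn's PolynomialFeatures, a minimal sketch of what this design matrix looks like, assembled by hand (assuming the column vector x defined above):

# Explicit design matrix for a fourth-order polynomial: columns 1, x, x^2, x^3, x^4
X = np.hstack([x**p for p in range(5)])   # shape (n, 5)

The full code, with scaling and a train-test split, reads: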
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
np.random.seed(2018)
n = 50
maxdegree = 5
# Make data set.
x = np.linspace(-3, 3, n).reshape(-1, 1)
y = np.exp(-x**2) + 1.5 * np.exp(-(x - 2)**2) + np.random.normal(0, 0.1, x.shape)
TestError = np.zeros(maxdegree)
TrainError = np.zeros(maxdegree)
polydegree = np.zeros(maxdegree)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
scaler = StandardScaler()
scaler.fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)
for degree in range(maxdegree):
    # PolynomialFeatures adds the bias column, so the regression skips its own intercept
    model = make_pipeline(PolynomialFeatures(degree=degree), LinearRegression(fit_intercept=False))
    clf = model.fit(x_train_scaled, y_train)
    y_fit = clf.predict(x_train_scaled)
    y_pred = clf.predict(x_test_scaled)
    polydegree[degree] = degree
    TestError[degree] = np.mean((y_test - y_pred)**2)
    TrainError[degree] = np.mean((y_train - y_fit)**2)
plt.plot(polydegree, TestError, label='Test Error')
plt.plot(polydegree, TrainError, label='Train Error')
plt.xlabel('Polynomial degree')
plt.ylabel('Mean squared error')
plt.legend()
plt.show()
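If all goes well, the figure mirrors Figure 2.11 of Hastie et al.: the training error decreases steadily with the polynomial degree, while the test error reaches a minimum at an intermediate degree and then grows again as the model starts to overfit. As a quick sanity check, the hand-coded errors can be compared with scikit-learn's mean_squared_error (a sketch, checking only the last degree of the loop above):

from sklearn.metrics import mean_squared_error
# y_pred and y_fit still hold the predictions for the last degree
print(mean_squared_error(y_test, y_pred), TestError[-1])
print(mean_squared_error(y_train, y_fit), TrainError[-1])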