The code here uses Ridge regression with cross-validation (CV) resampling and \( k \)-fold CV in order to fit a specific polynomial.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import PolynomialFeatures
# A seed just to ensure that the random numbers are the same for every run.
# Useful for eventual debugging.
np.random.seed(3155)
# Generate the data.
nsamples = 100
x = np.random.randn(nsamples)
y = 3*x**2 + np.random.randn(nsamples)
## Cross-validation on Ridge regression using KFold only
# Decide degree on polynomial to fit
poly = PolynomialFeatures(degree = 6)
# Decide which values of lambda to use
nlambdas = 500
lambdas = np.logspace(-3, 5, nlambdas)
# Initialize a KFold instance
k = 5
kfold = KFold(n_splits = k)
# Perform the cross-validation to estimate MSE
scores_KFold = np.zeros((nlambdas, k))
i = 0
for lmb in lambdas:
ridge = Ridge(alpha = lmb)
j = 0
for train_inds, test_inds in kfold.split(x):
xtrain = x[train_inds]
ytrain = y[train_inds]
xtest = x[test_inds]
ytest = y[test_inds]
Xtrain = poly.fit_transform(xtrain[:, np.newaxis])
ridge.fit(Xtrain, ytrain[:, np.newaxis])
Xtest = poly.fit_transform(xtest[:, np.newaxis])
ypred = ridge.predict(Xtest)
scores_KFold[i,j] = np.sum((ypred - ytest[:, np.newaxis])**2)/np.size(ypred)
j += 1
i += 1
estimated_mse_KFold = np.mean(scores_KFold, axis = 1)
## Cross-validation using cross_val_score from sklearn along with KFold
# kfold is an instance initialized above as:
# kfold = KFold(n_splits = k)
estimated_mse_sklearn = np.zeros(nlambdas)
i = 0
for lmb in lambdas:
ridge = Ridge(alpha = lmb)
X = poly.fit_transform(x[:, np.newaxis])
estimated_mse_folds = cross_val_score(ridge, X, y[:, np.newaxis], scoring='neg_mean_squared_error', cv=kfold)
# cross_val_score return an array containing the estimated negative mse for every fold.
# we have to the the mean of every array in order to get an estimate of the mse of the model
estimated_mse_sklearn[i] = np.mean(-estimated_mse_folds)
i += 1
## Plot and compare the slightly different ways to perform cross-validation
plt.figure()
plt.plot(np.log10(lambdas), estimated_mse_sklearn, label = 'cross_val_score')
plt.plot(np.log10(lambdas), estimated_mse_KFold, 'r--', label = 'KFold')
plt.xlabel('log10(lambda)')
plt.ylabel('mse')
plt.legend()
plt.show()