To more efficently train our network these equations are implemented using matrix operations. The error in the output layer is calculated simply as, with \( \boldsymbol{t} \) being our targets,
$$ \delta_L = \boldsymbol{t} - \boldsymbol{y} = (n_{inputs}, n_{categories}) .$$
The gradient for the output weights is calculated as
$$ \nabla W_{L} = \boldsymbol{a}^T \delta_L = (n_{hidden}, n_{categories}) ,$$
where \( \boldsymbol{a} = (n_{inputs}, n_{hidden}) \). This simply means that we are summing up the gradients for each input. Since we are going backwards we have to transpose the activation matrix.
The gradient with respect to the output bias is then
$$ \nabla \boldsymbol{b}_{L} = \sum_{i=1}^{n_{inputs}} \delta_L = (n_{categories}) .$$
The error in the hidden layer is
$$ \Delta_h = \delta_L W_{L}^T \circ f'(z_{h}) = \delta_L W_{L}^T \circ a_{h} \circ (1 - a_{h}) = (n_{inputs}, n_{hidden}) ,$$
where \( f'(a_{h}) \) is the derivative of the activation in the hidden layer. The matrix products mean that we are summing up the products for each neuron in the output layer. The symbol \( \circ \) denotes the Hadamard product, meaning element-wise multiplication.
This again gives us the gradients in the hidden layer:
$$ \nabla W_{h} = X^T \delta_h = (n_{features}, n_{hidden}) ,$$
$$ \nabla b_{h} = \sum_{i=1}^{n_{inputs}} \delta_h = (n_{hidden}) .$$
# to categorical turns our integer vector into a onehot representation
from sklearn.metrics import accuracy_score
# one-hot in numpy
def to_categorical_numpy(integer_vector):
n_inputs = len(integer_vector)
n_categories = np.max(integer_vector) + 1
onehot_vector = np.zeros((n_inputs, n_categories))
onehot_vector[range(n_inputs), integer_vector] = 1
return onehot_vector
#Y_train_onehot, Y_test_onehot = to_categorical(Y_train), to_categorical(Y_test)
Y_train_onehot, Y_test_onehot = to_categorical_numpy(Y_train), to_categorical_numpy(Y_test)
def feed_forward_train(X):
# weighted sum of inputs to the hidden layer
z_h = np.matmul(X, hidden_weights) + hidden_bias
# activation in the hidden layer
a_h = sigmoid(z_h)
# weighted sum of inputs to the output layer
z_o = np.matmul(a_h, output_weights) + output_bias
# softmax output
# axis 0 holds each input and axis 1 the probabilities of each category
exp_term = np.exp(z_o)
probabilities = exp_term / np.sum(exp_term, axis=1, keepdims=True)
# for backpropagation need activations in hidden and output layers
return a_h, probabilities
def backpropagation(X, Y):
a_h, probabilities = feed_forward_train(X)
# error in the output layer
error_output = probabilities - Y
# error in the hidden layer
error_hidden = np.matmul(error_output, output_weights.T) * a_h * (1 - a_h)
# gradients for the output layer
output_weights_gradient = np.matmul(a_h.T, error_output)
output_bias_gradient = np.sum(error_output, axis=0)
# gradient for the hidden layer
hidden_weights_gradient = np.matmul(X.T, error_hidden)
hidden_bias_gradient = np.sum(error_hidden, axis=0)
return output_weights_gradient, output_bias_gradient, hidden_weights_gradient, hidden_bias_gradient
print("Old accuracy on training data: " + str(accuracy_score(predict(X_train), Y_train)))
eta = 0.01
lmbd = 0.01
for i in range(1000):
# calculate gradients
dWo, dBo, dWh, dBh = backpropagation(X_train, Y_train_onehot)
# regularization term gradients
dWo += lmbd * output_weights
dWh += lmbd * hidden_weights
# update weights and biases
output_weights -= eta * dWo
output_bias -= eta * dBo
hidden_weights -= eta * dWh
hidden_bias -= eta * dBh
print("New accuracy on training data: " + str(accuracy_score(predict(X_train), Y_train)))