
Vanilla Neural Network Solution


import torch

# X (the given (8, 2) tensor of input features) and y (the given length-8
# tensor of integer class labels) come from the problem statement.

# Initialize weights and biases
W0 = torch.tensor([
    [ 0.48, -0.43],
    [-0.51, -0.48]
], requires_grad=True)
W1 = torch.tensor([
    [-0.99, 0.36, -0.75],
    [-0.66, 0.34,  0.66]
], requires_grad=True)
B0 = torch.tensor([0.23, 0.05], requires_grad=True)
B1 = torch.tensor([0.32, -0.44, 0.70], requires_grad=True)

# Calculate Yhat
X0 = X.reshape(8, 1, 2)
Z0 = X0 @ W0 + B0
A1 = 1/(1 + torch.exp(Z0))
Z1 = A1 @ W1 + B1
Yhat = torch.exp(Z1).squeeze() / torch.exp(Z1).sum(axis=2)

# Calculate the loss
p = Yhat[torch.arange(len(y)), y]
loss = -(torch.log(p) + torch.log(1-p)).mean()

# Calculate the gradient
loss.backward()

print(W0.grad)
# tensor([[ 0.0080, -0.0439],
#         [-0.0117, -0.0493]])

print(W1.grad)
# tensor([[-0.1275, -0.0093,  0.1368],
#         [-0.1668, -0.0241,  0.1909]])

print(B0.grad)
# tensor([-0.0066, -0.0896])

print(B1.grad)
# tensor([-0.2760, -0.0385,  0.3145])
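
As an optional sanity check (not part of the required solution), each row of Yhat should sum to 1, since softmax normalizes across the three output nodes:

# Sanity check: softmax rows sum to 1 (illustrative, not required)
assert torch.allclose(Yhat.sum(axis=1), torch.ones(8))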

Explanation

  1. Initialize the weight matrices and bias vectors using torch.tensor().

    W0 = torch.tensor([
        [ 0.48, -0.43],
        [-0.51, -0.48]
    ], requires_grad=True)
    W1 = torch.tensor([
        [-0.99, 0.36, -0.75],
        [-0.66, 0.34,  0.66]
    ], requires_grad=True)
    B0 = torch.tensor([0.23, 0.05], requires_grad=True)
    B1 = torch.tensor([0.32, -0.44, 0.70], requires_grad=True)
    
  2. Calculate Yhat.

    We do this using a sequence of tensor operations mimicking the feed-forward process.

    First we reshape X from an (8, 2) tensor into an (8, 1, 2) tensor so that we can perform matrix multiplication between it and W0. Then we calculate Z0, the inputs to the hidden layer activation functions.

    X0 = X.reshape(8, 1, 2)
    Z0 = X0 @ W0 + B0
    
    print(Z0)
    # tensor([[[ 0.0160, -0.2621]],
    #         [[ 0.5264, -0.2498]],
    #         [[ 0.1522, -0.5378]],
    #         [[-0.1073, -0.5379]],
    #         [[ 0.2872, -0.2772]],
    #         [[ 0.4289, -0.2932]],
    #         [[ 0.5454, -0.5301]],
    #         [[ 0.5051, -0.5628]]], grad_fn=<AddBackward0>)
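
    As an aside, the reshape turns each instance into a 1x2 row vector, so X0 @ W0 is a batched matrix multiplication in which W0 is broadcast across the batch of 8 instances. A quick, illustrative shape check:

    print(X0.shape, W0.shape, (X0 @ W0).shape)
    # torch.Size([8, 1, 2]) torch.Size([2, 2]) torch.Size([8, 1, 2])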
    

    Then we calculate the hidden layer activations, A1, by applying 1/(1 + e^z) element-wise with the help of torch.exp().

    A1 = 1/(1 + torch.exp(Z0))
    
    print(A1)
    # tensor([[[0.4960, 0.5651]],
    #         [[0.3713, 0.5621]],
    #         [[0.4620, 0.6313]],
    #         [[0.5268, 0.6313]],
    #         [[0.4287, 0.5689]],
    #         [[0.3944, 0.5728]],
    #         [[0.3669, 0.6295]],
    #         [[0.3764, 0.6371]]], grad_fn=<MulBackward0>)
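
    Since 1/(1 + e^z) equals the logistic sigmoid evaluated at -z, one optional sanity check (not part of the required solution) is to compare against PyTorch's built-in:

    # Illustrative check: 1/(1 + exp(z)) == sigmoid(-z)
    assert torch.allclose(A1, torch.sigmoid(-Z0))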
    

    Then we calculate the inputs to the softmax layer.

    Z1 = A1 @ W1 + B1
    
    print(Z1)
    # tensor([[[-0.5440, -0.0693,  0.7010]],
    #         [[-0.4186, -0.1152,  0.7925]],
    #         [[-0.5541, -0.0590,  0.7702]],
    #         [[-0.6182, -0.0357,  0.7216]],
    #         [[-0.4799, -0.0923,  0.7539]],
    #         [[-0.4485, -0.1033,  0.7822]],
    #         [[-0.4587, -0.0939,  0.8403]],
    #         [[-0.4731, -0.0879,  0.8382]]], grad_fn=<AddBackward0>)
    

    Lastly, we calculate Yhat by applying the softmax function to Z1.

    Yhat = torch.exp(Z1).squeeze() / torch.exp(Z1).sum(axis=2)
    
    print(Yhat)
    # tensor([[0.1645, 0.2644, 0.5712],
    #         [0.1751, 0.2371, 0.5878],
    #         [0.1563, 0.2563, 0.5874],
    #         [0.1513, 0.2709, 0.5778],
    #         [0.1693, 0.2494, 0.5813],
    #         [0.1714, 0.2420, 0.5867],
    #         [0.1638, 0.2359, 0.6003],
    #         [0.1618, 0.2378, 0.6004]], grad_fn=<DivBackward0>)
    

    Note on squeeze(): we use squeeze() to convert torch.exp(Z1) from shape (8, 1, 3) to (8, 3); squeeze() removes dimensions of size 1 from a tensor.
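
    The division then broadcasts: torch.exp(Z1).squeeze() has shape (8, 3) while torch.exp(Z1).sum(axis=2) has shape (8, 1), so each row gets divided by its own sum. As an optional cross-check (not part of the required solution), the result should agree with PyTorch's built-in softmax:

    # Illustrative check against the built-in softmax over the last axis
    assert torch.allclose(Yhat, torch.softmax(Z1, dim=2).squeeze())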

  3. Calculate the categorical cross entropy loss.

    First we use y to index Yhat, picking out the elements that correspond to the correct class labels.

    p = Yhat[torch.arange(len(y)), y]
    print(p)
    # tensor([0.1645, 0.5878, 0.5874, 0.1513, 0.5813, 0.5867, 0.1638, 0.2378],
    #        grad_fn=<IndexBackward0>)
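
    Here torch.arange(len(y)) supplies the row indices 0 through 7 and y supplies the column index for each row. A purely illustrative alternative uses torch.gather (p_alt is a throwaway name, not part of the original solution):

    # Illustrative alternative: gather the label column from each row
    p_alt = Yhat.gather(1, y.reshape(-1, 1)).squeeze(1)
    assert torch.allclose(p, p_alt)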
    

    Then we calculate the loss as the negative mean of the per-instance terms.

    loss = -(torch.log(p) + torch.log(1-p)).mean()
    print(loss)
    # tensor(1.6748, grad_fn=<NegBackward0>)
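
    Since log(p) + log(1 - p) = log(p * (1 - p)), the same scalar can be computed with a single log call; the check below is purely illustrative:

    # Illustrative equivalence check for the loss expression
    assert torch.allclose(loss, -torch.log(p * (1 - p)).mean())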
    
  4. Calculate the gradient of the loss with respect to the weights and biases.

    Since loss is a scalar, we can simply call loss.backward(), and PyTorch will calculate the gradients for us.

    loss.backward()
    
    print(W0.grad)
    # tensor([[ 0.0080, -0.0439],
    #         [-0.0117, -0.0493]])
    
    print(W1.grad)
    # tensor([[-0.1275, -0.0093,  0.1368],
    #         [-0.1668, -0.0241,  0.1909]])
    
    print(B0.grad)
    # tensor([-0.0066, -0.0896])
    
    print(B1.grad)
    # tensor([-0.2760, -0.0385,  0.3145])
    

    This step requires that W0, W1, B0, and B1 be leaf tensors with requires_grad=True.
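
    A quick way to confirm that requirement, followed by a sketch of how the gradients would typically be consumed afterwards (the 0.1 learning rate is an arbitrary illustrative value, not part of this problem):

    print(W0.is_leaf, W0.requires_grad)
    # True True

    # Illustrative gradient-descent step: update parameters outside autograd
    # tracking, then clear the gradients before any subsequent backward() call.
    with torch.no_grad():
        for param in (W0, W1, B0, B1):
            param -= 0.1 * param.grad
            param.grad = None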

