Vanilla Neural Network Solution


import torch

# X (an (8, 2) tensor of inputs) and y (a length-8 tensor of integer class
# labels) are provided by the problem.

# Initialize weights and biases
W0 = torch.tensor([
    [ 0.48, -0.43],
    [-0.51, -0.48]
], requires_grad=True)
W1 = torch.tensor([
    [-0.99, 0.36, -0.75],
    [-0.66, 0.34,  0.66]
], requires_grad=True)
B0 = torch.tensor([0.23, 0.05], requires_grad=True)
B1 = torch.tensor([0.32, -0.44, 0.70], requires_grad=True)

# Calculate Yhat
X0 = X.reshape(8, 1, 2)
Z0 = X0 @ W0 + B0
A1 = 1/(1 + torch.exp(-Z0))
Z1 = A1 @ W1 + B1
Yhat = torch.exp(Z1).squeeze() / torch.exp(Z1).sum(axis=2)

# Calculate the loss
p = Yhat[torch.arange(len(y)), y]
loss = -(torch.log(p) + torch.log(1-p)).mean()

# Calculate the gradient
loss.backward()

print(W0.grad)
# tensor([[4.6497e-05, 3.3986e-02],
#         [1.3783e-02, 4.1103e-02]])

print(W1.grad)
# tensor([[-0.1231, -0.0010,  0.1241],
#         [-0.0908,  0.0039,  0.0869]])

print(B0.grad)
# tensor([0.0160, 0.0716])

print(B1.grad)
# tensor([-0.2299,  0.0075,  0.2224])

Explanation

  1. Initialize the weight matrices and bias vectors using torch.tensor().

    W0 = torch.tensor([
        [ 0.48, -0.43],
        [-0.51, -0.48]
    ], requires_grad=True)
    W1 = torch.tensor([
        [-0.99, 0.36, -0.75],
        [-0.66, 0.34,  0.66]
    ], requires_grad=True)
    B0 = torch.tensor([0.23, 0.05], requires_grad=True)
    B1 = torch.tensor([0.32, -0.44, 0.70], requires_grad=True)
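
    These shapes give a network with 2 input features, 2 hidden nodes, and 3 output classes. As an optional sanity check (not part of the original solution), we can confirm the shapes and that autograd will track these tensors.

    print(W0.shape, W1.shape, B0.shape, B1.shape)
    # torch.Size([2, 2]) torch.Size([2, 3]) torch.Size([2]) torch.Size([3])
    print(W0.requires_grad)
    # True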
    
  2. Calculate Yhat.

    We do this using a sequence of tensor operations mimicking the feed-forward process.

    First we reshape X from an (8, 2) tensor into an (8, 1, 2) tensor so that we can perform matrix multiplication between it and W0. Then we calculate Z0, the inputs to the hidden layer activation functions.

    X0 = X.reshape(8, 1, 2)
    Z0 = X0 @ W0 + B0
    
    print(Z0)
    # tensor([[[ 0.0160, -0.2621]],
    #         [[ 0.5264, -0.2498]],
    #         [[ 0.1522, -0.5378]],
    #         [[-0.1073, -0.5379]],
    #         [[ 0.2872, -0.2772]],
    #         [[ 0.4289, -0.2932]],
    #         [[ 0.5454, -0.5301]],
    #         [[ 0.5051, -0.5628]]], grad_fn=<AddBackward0>)
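
    The reshape treats each instance as an explicit (1, 2) row vector. As an optional aside (not part of the original solution), the same values can be computed without the middle dimension by multiplying the (8, 2) matrix X by W0 directly.

    Z0_flat = X @ W0 + B0   # shape (8, 2)
    print(torch.allclose(Z0_flat, Z0.squeeze(1)))
    # True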
    

    Then we calculate the hidden layer activations by applying the logistic (sigmoid) function element-wise, with the help of torch.exp().

    A1 = 1/(1 + torch.exp(-Z0))
    
    print(A1)
    # tensor([[[0.5040, 0.4349]],
    #         [[0.6287, 0.4379]],
    #         [[0.5380, 0.3687]],
    #         [[0.4732, 0.3687]],
    #         [[0.5713, 0.4311]],
    #         [[0.6056, 0.4272]],
    #         [[0.6331, 0.3705]],
    #         [[0.6236, 0.3629]]], grad_fn=<MulBackward0>)
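
    Equivalently (an optional aside), PyTorch's built-in torch.sigmoid() computes the same logistic function, so it can stand in for the manual 1/(1 + torch.exp(-Z0)).

    print(torch.allclose(A1, torch.sigmoid(Z0)))
    # True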
    

    Then we calculate the inputs to the softmax layer.

    Z1 = A1 @ W1 + B1
    
    print(Z1)
    # tensor([[[-0.4660, -0.1107,  0.6090]],
    #         [[-0.5914, -0.0648,  0.5175]],
    #         [[-0.4559, -0.1210,  0.5398]],
    #         [[-0.3918, -0.1443,  0.5884]],
    #         [[-0.5301, -0.0877,  0.5561]],
    #         [[-0.5615, -0.0767,  0.5278]],
    #         [[-0.5513, -0.0861,  0.4697]],
    #         [[-0.5369, -0.0921,  0.4718]]], grad_fn=<AddBackward0>)
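
    Shape-wise, the batched matrix multiplication broadcasts W1 across the batch, so (8, 1, 2) @ (2, 3) gives (8, 1, 3), and B1 broadcasts over the last axis. A quick optional check:

    print(Z1.shape)
    # torch.Size([8, 1, 3])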
    

    Lastly, we calculate Yhat by applying softmax to Z1.

    Yhat = torch.exp(Z1).squeeze() / torch.exp(Z1).sum(axis=2)
    
    print(Yhat)
    # tensor([[0.1867, 0.2663, 0.5470],
    #         [0.1747, 0.2958, 0.5295],
    #         [0.1959, 0.2738, 0.5303],
    #         [0.2022, 0.2590, 0.5388],
    #         [0.1812, 0.2820, 0.5368],
    #         [0.1787, 0.2902, 0.5311],
    #         [0.1863, 0.2966, 0.5171],
    #         [0.1886, 0.2943, 0.5171]], grad_fn=<DivBackward0>)
    

    squeeze()

    Note that we use squeeze() to convert torch.exp(Z1) from shape (8, 1, 3) to (8, 3). squeeze() removes dimensions of size 1 from a tensor.
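
    As an optional aside (not part of the original solution), torch.softmax() computes the same thing, and each row of Yhat is a valid probability distribution over the three classes.

    print(torch.allclose(Yhat, torch.softmax(Z1, dim=2).squeeze()))
    # True
    print(torch.allclose(Yhat.sum(axis=1), torch.ones(8)))
    # True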

  3. Calculate the categorical cross entropy loss.

    First we use y to index Yhat, picking out the elements that correspond to the correct class labels.

    p = Yhat[torch.arange(len(y)), y]
    print(p)
    # tensor([0.1867, 0.5295, 0.5303, 0.2022, 0.5368, 0.5311, 0.1863, 0.2943],
    #        grad_fn=<IndexBackward0>)
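
    The same values can also be picked out with torch.gather() (an optional alternative, assuming y is a length-8 tensor of int64 class labels as given by the problem).

    p_alt = Yhat.gather(1, y.unsqueeze(1)).squeeze(1)
    print(torch.allclose(p, p_alt))
    # True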
    

    Then we calculate the overall loss as the mean of the per-instance losses, where each instance contributes -(log(p) + log(1 - p)).

    loss = -(torch.log(p) + torch.log(1-p)).mean()
    print(loss)
    # tensor(1.5912, grad_fn=<NegBackward0>)
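
    An equivalent formulation (optional) replaces torch.log(1 - p) with torch.log1p(-p), which is slightly more numerically robust when p is near zero and yields the same loss.

    loss_alt = -(torch.log(p) + torch.log1p(-p)).mean()
    print(torch.allclose(loss, loss_alt))
    # True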
    
  4. Calculate the gradient of loss with respect to the weights and biases.

    Since loss is a scalar, we can simply call loss.backward(), and PyTorch will calculate the gradients for us.

    loss.backward()
    
    print(W0.grad)
    # tensor([[4.6497e-05, 3.3986e-02],
    #         [1.3783e-02, 4.1103e-02]])
    
    print(W1.grad)
    # tensor([[-0.1231, -0.0010,  0.1241],
    #         [-0.0908,  0.0039,  0.0869]])
    
    print(B0.grad)
    # tensor([0.0160, 0.0716])
    
    print(B1.grad)
    # tensor([-0.2299,  0.0075,  0.2224])
    

    This step requires W0, W1, B0, and B1 to be leaf tensors with requires_grad=True.
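
    For example, gradients are accumulated in .grad only for leaf tensors created with requires_grad=True; intermediate results such as Z0 carry a grad_fn but their .grad is not populated.

    print(W0.is_leaf, Z0.is_leaf)
    # True False
    print(Z0.grad)
    # None (PyTorch also warns that .grad of a non-leaf tensor is not populated)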


See the problem