【Pytorch】 Pytorch튜토리얼 5 - LEARNING PYTORCH WITH EXAMPLES

27 Apr 2020 in Pytorch / Docker / Git

아래의 주석을 참고해서 공부하기 바랍니다.

0. 사이트 링크

1. Numpy를 이용해서 가중치 갱신해보기

numpy 연산을 사용하여 네트워크를 통해 순방향 및 역방향 패스를 수동으로 구현

# 간단한 2층 Fully connected layer를 구현한.
import numpy as np

# N is batch size;       D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random input and output data
x = np.random.randn(N, D_in)    # 64* 1000  -> 64장. 1장당 1000개의 픽셀 데이터 가짐.
y = np.random.randn(N, D_out)   # 64* 10    

# Randomly initialize weights
w1 = np.random.randn(D_in, H)   # 1000 * 100
w2 = np.random.randn(H, D_out)  # 100 * 10

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y
    h = x.dot(w1)
    h_relu = np.maximum(h, 0)
    y_pred = h_relu.dot(w2)

    # Compute and print loss
    loss = np.square(y_pred - y).sum()
    print(t, loss)

    # Backprop to compute gradients of w1 and w2 with respect to loss
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.T.dot(grad_y_pred)
    grad_h_relu = grad_y_pred.dot(w2.T)
    grad_h = grad_h_relu.copy()
    grad_h[h < 0] = 0
    grad_w1 = x.T.dot(grad_h)

    # 가중치 갱신 : Update weights
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

import torch


dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random input and output data
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Randomly initialize weights
w1 = torch.randn(D_in, H, device=device, dtype=dtype)
w2 = torch.randn(H, D_out, device=device, dtype=dtype)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y
    h = x.mm(w1)            # https://pytorch.org/docs/stable/torch.html#torch.mm
    h_relu = h.clamp(min=0) # https://pytorch.org/docs/stable/tensors.html
    y_pred = h_relu.mm(w2)

    # Compute and print loss
    loss = (y_pred - y).pow(2).sum().item()
    if t % 100 == 99:
        print(t, loss)

    # Backprop to compute gradients of w1 and w2 with respect to loss
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.t().mm(grad_y_pred)
    grad_h_relu = grad_y_pred.mm(w2.t())
    grad_h = grad_h_relu.clone()
    grad_h[h < 0] = 0
    grad_w1 = x.t().mm(grad_h)

    # Update weights using gradient descent
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

3. Autograd

위에는 아주 작은 Fully connected layer이므로 backward를 저렇게 손수 구현했지만,
만약 layer가 깊고, 복잡하면 backward를 손수 구현하기 쉽지 않다.
Torch에서는 automatic differentiaion 을 제공하므로 자동으로 backward를 사용할 수 있다.
Torch에서는 computational graph도 정의해준다.

## 주석을 포함하지 않는 코드 ##
import torch

dtype = torch.float
device = torch.device("cpu")

N, D_in, H, D_out = 64, 1000, 100, 10

x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    y_pred = x.mm(w1).clamp(min=0).mm(w2)

    loss = (y_pred - y).pow(2).sum()
    if t % 100 == 99:
        print(t, loss.item())

    loss.backward()

    with torch.no_grad():
        w1 -= learning_rate * (w1.grad)
        w2 -= learning_rate * (w2.grad)
        w1.grad.zero_()
        w2.grad.zero_()

## 주석을 포함하는 코드 ##
import torch

dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU

N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold input and outputs.
# Setting requires_grad=False indicates that we do not need to compute gradients
# with respect to these Tensors during the backward pass.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype) 
# d_loss / d_y를 구할 필요는 없으므로, requires_grad = True로 할 필요 없다.

# Create random Tensors for weights.
# Setting requires_grad=True indicates that we want to compute gradients with
# respect to these Tensors during the backward pass.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y using operations on Tensors; these
    # are exactly the same operations we used to compute the forward pass using
    # Tensors, but we do not need to keep references to intermediate values since
    # we are not implementing the backward pass by hand.
    y_pred = x.mm(w1).clamp(min=0).mm(w2)

    # Compute and print loss using operations on Tensors.
    # Now loss is a Tensor of shape (1,)
    # loss.item() gets the scalar value held in the loss.
    loss = (y_pred - y).pow(2).sum()
    if t % 100 == 99:
        print(t, loss.item())

    # Use autograd to compute the backward pass. This call will compute the
    # gradient of loss with respect to all Tensors with requires_grad=True.
    # After this call w1.grad and w2.grad will be Tensors holding the gradient
    # of the loss with respect to w1 and w2 respectively.
    # requires_grad=True 가 되어 있는 모든 변수(nm)에 대한, 미분 Graph를 따라, d_loss/d_nm를 구해 놓는다.
    loss.backward()

    # Manually update weights using gradient descent. Wrap in torch.no_grad()
    # because weights have requires_grad=True, but we don't need to track this in autograd.
    # w는 requires_grad가 되어 있기 때문에, 여기서의 연산까지도 grad track할 필요가 없다.

    # An alternative way is to operate on weight.data and weight.grad.data. 
    # 3장의 가중치 갱신 파트를 볼 것. 
    # Recall that tensor.data gives a tensor that shares the storage with
    # tensor, but doesn't track history.

    # You can also use torch.optim.SGD to achieve this.
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # Manually zero the gradients after updating weights
        w1.grad.zero_()
        w2.grad.zero_()

4. 새로운 Autograd 기능 정의하기

torch.autograd.Function 를 상속하는 class를 만들어보자.
이러한 class는 새로운 [activation function]을 정의하는데 큰 도움을 준다.
아래에 relu대로 동작하는 class를 직접 만들어 보자.

## 주석을 포함하지 않는 코드 ##
import torch


class MyReLU(torch.autograd.Function):
    # staticmethod vs classmethod : http://egloos.zum.com/mcchae/v/11031012
    @staticmethod
    def forward(ctx, input): 
        ctx.save_for_backward(input)    # https://bit.ly/3aHjPoD : torch.autograd.function._ContextMethodMixin.save_for_backward
        return input.clamp(min=0)

    @staticmethod
    def backward(ctx, grad_output): # relu의 backward는 forward에서 0으로 바꾼것은 backward도 0으로. 나머지는 그대로. 이다.
        input, = ctx.saved_tensors
        grad_input = grad_output.clone()
        grad_input[input < 0] = 0
        return grad_input


dtype = torch.float
device = torch.device("cpu")

N, D_in, H, D_out = 64, 1000, 100, 10

x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # https://pytorch.org/docs/stable/autograd.html  -> 찾기 : apply 
    # torch.autograd.Function 를 상속하는 class는 apply함수를 이용해 사용 가능하다
    # >> #Use it by calling the apply method:
    relu = MyReLU.apply

    y_pred = relu(x.mm(w1)).mm(w2)

    loss = (y_pred - y).pow(2).sum()
    if t % 100 == 99:
        print(t, loss.item())

    loss.backward()

    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        w1.grad.zero_()
        w2.grad.zero_()

## 주석을 포함하는 코드 ##
import torch


class MyReLU(torch.autograd.Function):
    """
    We can implement our own custom autograd Functions by subclassing
    'torch.autograd.Function' and implementing the forward and backward passes
    which operate on Tensors.
    """

    @staticmethod
    def forward(ctx, input):
        """
        In the forward pass we receive a Tensor containing the input and return
        a Tensor containing the output. ctx is 'a context object' that can be used
        to stash(저장해두다) information for backward computation. You can cache arbitrary
        objects for use in the backward pass using the ctx.save_for_backward method.
        """
        ctx.save_for_backward(input) # backward를 하기 위해, input을 메모리에 잠시 저장.
        return input.clamp(min=0)

    @staticmethod
    def backward(ctx, grad_output):
        """
        In the backward pass we receive a Tensor containing the gradient of the loss
        with respect to the output, and we need to compute the gradient of the loss
        with respect to the input.
        """
        input, = ctx.saved_tensors
        grad_input = grad_output.clone()
        grad_input[input < 0] = 0
        return grad_input


dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU

N, D_in, H, D_out = 64, 1000, 100, 10
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # To apply our Function, we use Function.apply method. We alias this as 'relu'.
    relu = MyReLU.apply

    # Forward pass: compute predicted y using operations; we compute
    # ReLU using our custom autograd operation.
    y_pred = relu(x.mm(w1)).mm(w2)

    # Compute and print loss
    loss = (y_pred - y).pow(2).sum()
    if t % 100 == 99:
        print(t, loss.item())

    loss.backward()

    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad
        w1.grad.zero_() 이 작업을 하지 않으면, grad가 누적된다.
        w2.grad.zero_()

5. nn 모듈

nn 패키지는 신경망 계층(layer)들과 거의 동일한 Module 의 집합을 정의합니다.
여기서 Module은 입력 Tensor를 받고 출력 Tensor를 계산하는 것은 물론,
학습 가능한 매개변수를 갖는 Tensor 같은 내부 상태(internal state)를 갖습니다.

import torch

N, D_in, H, D_out = 64, 1000, 100, 10

x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# nn 패키지를 사용하여 모델을 순차적 계층(sequence of layers)으로 정의합니다.
# nn.Sequential은 다른 Module들을 포함하는 Module로, 그 Module들을 순차적으로
# 입력으로부터 출력을 계산하고, 내부 Tensor에 가중치와 편향을 저장합니다.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)

# 또한 nn 패키지에는 널리 사용하는 손실 함수들에 대한 정의도 포함하고 있습니다;
# 여기에서는 평균 제곱 오차(MSE; Mean Squared Error)를 손실 함수로 사용하겠습니다.
loss_fn = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-4
for t in range(500):
    # 1. forward pass 하기
    # 순전파 단계: 모델에 x를 전달하여 예상되는 y 값을 계산합니다. Module 객체는
    # __call__ 연산자를 덮어써(override) 함수처럼 호출할 수 있게 합니다.
    # 이렇게 함으로써 입력 데이터의 Tensor를 Module에 전달하여 출력 데이터의
    # Tensor를 생성합니다.
    y_pred = model(x)

    # 2. Loss값 계산하기
    # 손실을 계산하고 출력합니다. 예측한 y와 정답인 y를 갖는 Tensor들을 전달하고,
    # 손실 함수는 손실 값을 갖는 Tensor를 반환합니다.
    loss = loss_fn(y_pred, y)
    if t % 100 == 99:
        print(t, loss.item())

    # 3. modul 내부의 grad를 0으로 모두 초기화 하고, Backpropagation을 시행한다.
    model.zero_grad()
    # 역전파 단계: 모델의 학습 가능한 모든 매개변수에 대해 손실의 변화도를
    # 계산합니다. 내부적으로 각 Module의 매개변수는 requires_grad=True 일 때
    # Tensor 내에 저장되므로, 이 호출은 모든 모델의 모든 학습 가능한 매개변수의
    # 변화도를 계산하게 됩니다. 
    # x와 y는 계산할 필요가 없으므로, requires_grad=True로 해주지 않아도 됐다. 
    # 가중치와 편향은 자동으로 requires_grad=True로 정의된다.
    loss.backward()

    # 4. Backpropagation한 값을 이용해서, 가중치를 갱신한다.
    # 경사하강법(gradient descent)를 사용하여 가중치를 갱신합니다. 각 매개변수는
    # Tensor이므로 이전에 했던 것과 같이 변화도에 접근할 수 있습니다.
    with torch.no_grad():
        for param in model.parameters(): # 여기서 사용한 메소드. 눈 여겨보기
            param -= learning_rate * param.grad

6. Optimizer

지금까지는 모델의 가중치를 직접 갱신했다. {Ex. param -= learning_rate * param.grad} 하지만 우리는 그렇게 할 필요가 없다.

import torch

N, D_in, H, D_out = 64, 1000, 100, 10

x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)
loss_fn = torch.nn.MSELoss(reduction='sum')

# optim 패키지를 사용하여 모델의 가중치를 갱신할 Optimizer를 정의합니다.
# Adam 생성자의 첫번째 인자는 어떤 Tensor가 갱신되어야 하는지 알려줍니다.
learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

for t in range(500):
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    if t % 100 == 99:
        print(t, loss.item())

    # 역전파 단계 전에, Optimizer 객체를 사용하여 (모델의 학습 가능한 가중치인)
    # 갱신할 변수들에 대한 모든 변화도를 0으로 만듭니다. 이렇게 하는 이유는
    # 기본적으로 .backward()를 호출할 때마다 변화도가 메모리 버퍼(buffer)에 (덮어쓰지 않고)
    # 누적되기 때문입니다. 더 자세한 내용은 torch.autograd.backward에 대한 문서를 참조하세요.
    # 따라서 위의 방법이었던, model.zero_grad()를 사용하는게 아닌.. 
    optimizer.zero_grad()

    # 역전파 단계: 모델의 매개변수에 대한 손실의 변화도를 계산합니다.
    loss.backward()

    # Optimizer의 step 함수를 호출하면 매개변수가 갱신됩니다.
    optimizer.step()

7. nn.Module 적극적으로 이용하기

지금까지 model = torch.nn.Sequential를 사용해서 model을 정의했다.
하지만 이렇게 하지 않고 아래와 같은 방법을 사용하는게 복잡한 모델 구성할 때 이롭다.

이 코드를 통채로 외우자!!!!!!!!!!!!!

# 이 코드를 통채로 외우자!!!!!!!!!!!!!
import torch


class TwoLayerNet(torch.nn.Module):
    def __init__(self, D_in, H, D_out):
        super(TwoLayerNet, self).__init__()
        self.linear1 = torch.nn.Linear(D_in, H)
        self.linear2 = torch.nn.Linear(H, D_out)
        # self.relu1 = torch.nn.ReLU()

    def forward(self, x):
        # self.relu1(self.linear1(x))
        h_relu = self.linear1(x).clamp(min=0)
        y_pred = self.linear2(h_relu)
        return y_pred

N, D_in, H, D_out = 64, 1000, 100, 10

x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

model = TwoLayerNet(D_in, H, D_out)
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)

for t in range(500):
    y_pred = model(x)
    loss = criterion(y_pred, y)

    if t % 100 == 99:
        print(t, loss.item())

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

위와 같이 정직한 Modul을 사용하는게 아닌, 복잡한 모델을 생성해야할 수 있다.
이를 위해서 우리는 아래와 같은 방법을 사용해서 모델을 정의할 수 있다.
여기서는 2번째 Layer의 통과를 0 ~ 3개의 subLayer를 통과하도록 만든다. 가장 안쪽(innermost)의 은닉층들을 계산하기 위해 동일한 가중치를 여러 번 재사용하는 코드를 만들어보자.

import random
import torch


class DynamicNet(torch.nn.Module):
    def __init__(self, D_in, H, D_out):
        super(DynamicNet, self).__init__()  
        self.input_linear = torch.nn.Linear(D_in, H)        #1 Layer
        self.middle_linear = torch.nn.Linear(H, H)          #2 Layer : 위 코드와 다르게 이것 추가.
        self.output_linear = torch.nn.Linear(H, D_out)      #3 Layer

    def forward(self, x):
        """
        모델의 순전파 단계를 정의할 때 반복문이나 조건문과 같은 
        일반적인 Python 제어 흐름 연산자를 사용할 수 있습니다.

        여기에서 연산 그래프를 정의할 때 동일 Module을 여러번 재사용하는 것이
        완벽히 안전하다는 것을 알 수 있습니다. 
        """
        h_relu = self.input_linear(x).clamp(min=0)          # 1Layer + 1Relu 통과
        for _ in range(random.randint(0, 3)):               # 0 ~ 3 번.2 Layer + 1Relu 를 통과
            h_relu = self.middle_linear(h_relu).clamp(min=0)
        y_pred = self.output_linear(h_relu)                 # 3Layer를 통과시킨다
        return y_pred

N, D_in, H, D_out = 64, 1000, 100, 10

x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

model = DynamicNet(D_in, H, D_out)
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4, momentum=0.9)

for t in range(500):
    y_pred = model(x)
    loss = criterion(y_pred, y)
    if t % 100 == 99:
        print(t, loss.item())
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()